def testSimple(self):
  converter = data_converter.AsNStepTransition(self._data_context, gamma=0.5)
  transition = tensor_spec.sample_spec_nest(
      self._data_context.transition_spec, outer_dims=[2])
  converted = converter(transition)
  (transition, converted) = self.evaluate((transition, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, transition)
def testTrajectoryNotSingleStepTransition(self):
  converter = data_converter.AsNStepTransition(self._data_context, gamma=0.5)
  traj = tensor_spec.sample_spec_nest(
      self._data_context.trajectory_spec, outer_dims=[2, 3])
  converted = converter(traj)
  expected = trajectory.to_n_step_transition(traj, gamma=0.5)
  (expected, converted) = self.evaluate((expected, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, expected)
def testTrajectoryInvalidTimeDimensionRaises(self):
  converter = data_converter.AsNStepTransition(
      self._data_context, gamma=0.5, n=4)
  traj = tensor_spec.sample_spec_nest(
      self._data_context.trajectory_spec, outer_dims=[2, 3])
  with self.assertRaisesRegex(
      ValueError, r'has a time axis dim value \'3\' vs the expected \'5\''):
    converter(traj)
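# Hedged companion sketch (not part of the original test file): with n=4,
# AsNStepTransition expects trajectories whose time axis has length n + 1 = 5,
# so sampling with outer_dims=[2, 5] should convert without raising. The test
# name below is illustrative and assumes the same self._data_context fixture
# as the surrounding tests.
def testTrajectoryValidTimeDimension(self):
  converter = data_converter.AsNStepTransition(
      self._data_context, gamma=0.5, n=4)
  traj = tensor_spec.sample_spec_nest(
      self._data_context.trajectory_spec, outer_dims=[2, 5])
  converted = converter(traj)
  expected = trajectory.to_n_step_transition(traj, gamma=0.5)
  (expected, converted) = self.evaluate((expected, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, expected)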
def _setup_data_converter(self, q_network, gamma, n_step_update):
  if q_network.state_spec:
    # AsNStepTransition does not support emitting [B, T, ...] tensors,
    # which we need for DQN-RNN.
    self._as_transition = data_converter.AsTransition(
        self.data_context, squeeze_time_dim=False)
  else:
    # This reduces the n-step return and removes the extra time dimension,
    # allowing the rest of the computations to be independent of the
    # n-step parameter.
    self._as_transition = data_converter.AsNStepTransition(
        self.data_context, gamma=gamma, n=n_step_update)
def testPrunes(self):
  converter = data_converter.AsNStepTransition(self._data_context, gamma=0.5)
  my_spec = self._data_context.transition_spec.replace(
      action_step=self._data_context.transition_spec.action_step.replace(
          action={
              'action1': tf.TensorSpec((), tf.float32),
              'action2': tf.TensorSpec([4], tf.int32)
          }))
  transition = tensor_spec.sample_spec_nest(my_spec, outer_dims=[2])
  converted = converter(transition)
  expected = tf.nest.map_structure(lambda x: x, transition)
  del expected.action_step.action['action2']
  (expected, converted) = self.evaluate((expected, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, expected)
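# The converter tests above rely on a self._data_context fixture that is not
# shown in this excerpt. Below is a minimal sketch of what such a setUp could
# look like, assuming a 2-vector float observation and a single scalar float
# action keyed 'action1' (matching the key used in testPrunes); the real
# fixture may use different specs.
import tensorflow as tf

from tf_agents.agents import data_converter
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory


class AsNStepTransitionTest(tf.test.TestCase):

  def setUp(self):
    super(AsNStepTransitionTest, self).setUp()
    # Illustrative specs only.
    self._data_context = data_converter.DataContext(
        time_step_spec=ts.time_step_spec(
            observation_spec=tf.TensorSpec((2,), tf.float32)),
        action_spec={'action1': tf.TensorSpec((), tf.float32)},
        info_spec=())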
def _setup_data_converter(self, q_network, gamma, n_step_update):
  if q_network.state_spec:
    if not self._in_graph_bellman_update:
      self._data_context = data_converter.DataContext(
          time_step_spec=self._time_step_spec,
          action_spec=self._action_spec,
          info_spec=self._collect_policy.info_spec,
          policy_state_spec=self._q_network.state_spec,
          use_half_transition=True)
      self._as_transition = data_converter.AsHalfTransition(
          self.data_context, squeeze_time_dim=False)
    else:
      self._data_context = data_converter.DataContext(
          time_step_spec=self._time_step_spec,
          action_spec=self._action_spec,
          info_spec=self._collect_policy.info_spec,
          policy_state_spec=self._q_network.state_spec,
          use_half_transition=False)
      self._as_transition = data_converter.AsTransition(
          self.data_context,
          squeeze_time_dim=False,
          prepend_t0_to_next_time_step=True)
  else:
    if not self._in_graph_bellman_update:
      self._data_context = data_converter.DataContext(
          time_step_spec=self._time_step_spec,
          action_spec=self._action_spec,
          info_spec=self._collect_policy.info_spec,
          policy_state_spec=self._q_network.state_spec,
          use_half_transition=True)
      self._as_transition = data_converter.AsHalfTransition(
          self.data_context, squeeze_time_dim=True)
    else:
      # This reduces the n-step return and removes the extra time dimension,
      # allowing the rest of the computations to be independent of the
      # n-step parameter.
      self._as_transition = data_converter.AsNStepTransition(
          self.data_context, gamma=gamma, n=n_step_update)
def __init__(
    self,
    time_step_spec: ts.TimeStep,
    action_spec: types.NestedTensorSpec,
    q_network: network.Network,
    optimizer: types.Optimizer,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None,
    epsilon_greedy: types.Float = 0.1,
    n_step_update: int = 1,
    boltzmann_temperature: Optional[types.Int] = None,
    emit_log_probability: bool = False,
    # Params for target network updates
    target_q_network: Optional[network.Network] = None,
    target_update_tau: types.Float = 1.0,
    target_update_period: int = 1,
    # Params for training.
    td_errors_loss_fn: Optional[types.LossFn] = None,
    gamma: types.Float = 1.0,
    reward_scale_factor: types.Float = 1.0,
    gradient_clipping: Optional[types.Float] = None,
    # Params for debugging
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    train_step_counter: Optional[tf.Variable] = None,
    name: Optional[Text] = None,
    entropy_tau: types.Float = 0.9,
    alpha: types.Float = 0.3):
  tf.Module.__init__(self, name=name)
  self._check_action_spec(action_spec)

  if epsilon_greedy is not None and boltzmann_temperature is not None:
    raise ValueError(
        'Configured both epsilon_greedy value {} and temperature {}, '
        'however only one of them can be used for exploration.'.format(
            epsilon_greedy, boltzmann_temperature))

  self._observation_and_action_constraint_splitter = (
      observation_and_action_constraint_splitter)
  self._q_network = q_network
  net_observation_spec = time_step_spec.observation
  if observation_and_action_constraint_splitter:
    net_observation_spec, _ = observation_and_action_constraint_splitter(
        net_observation_spec)
  q_network.create_variables(net_observation_spec)
  if target_q_network:
    target_q_network.create_variables(net_observation_spec)
  self._target_q_network = common.maybe_copy_target_network_with_checks(
      self._q_network,
      target_q_network,
      input_spec=net_observation_spec,
      name='TargetQNetwork')

  self._check_network_output(self._q_network, 'q_network')
  self._check_network_output(self._target_q_network, 'target_q_network')

  self._epsilon_greedy = epsilon_greedy
  self._n_step_update = n_step_update
  self._boltzmann_temperature = boltzmann_temperature
  self._optimizer = optimizer
  self._td_errors_loss_fn = (
      td_errors_loss_fn or common.element_wise_huber_loss)
  self._gamma = gamma
  self._reward_scale_factor = reward_scale_factor
  self._gradient_clipping = gradient_clipping
  self._update_target = self._get_target_updater(
      target_update_tau, target_update_period)
  self.entropy_tau = entropy_tau
  self.alpha = alpha

  policy, collect_policy = self._setup_policy(time_step_spec, action_spec,
                                              boltzmann_temperature,
                                              emit_log_probability)

  if q_network.state_spec and n_step_update != 1:
    raise NotImplementedError(
        'DqnAgent does not currently support n-step updates with stateful '
        'networks (i.e., RNNs), but n_step_update = {}'.format(n_step_update))

  train_sequence_length = (
      n_step_update + 1 if not q_network.state_spec else None)

  super(dqn_agent.DqnAgent, self).__init__(
      time_step_spec,
      action_spec,
      policy,
      collect_policy,
      train_sequence_length=train_sequence_length,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step_counter,
      validate_args=False,
  )

  if q_network.state_spec:
    # AsNStepTransition does not support emitting [B, T, ...] tensors,
    # which we need for DQN-RNN.
    self._as_transition = data_converter.AsTransition(
        self.data_context, squeeze_time_dim=False)
  else:
    # This reduces the n-step return and removes the extra time dimension,
    # allowing the rest of the computations to be independent of the
    # n-step parameter.
    self._as_transition = data_converter.AsNStepTransition(
        self.data_context, gamma=gamma, n=n_step_update)
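# Hedged usage sketch, not the agent's actual loss implementation: the method
# name _loss_sketch and the `experience` argument are illustrative. It shows
# how the self._as_transition converter built above is typically consumed:
# the experience passed to train() is normalized into a Transition namedtuple
# whose fields are time_step, action_step, and next_time_step.
def _loss_sketch(self, experience):
  transition = self._as_transition(experience)
  time_steps = transition.time_step
  actions = transition.action_step.action
  next_time_steps = transition.next_time_step
  # With AsNStepTransition, the n-step return has already been folded into
  # next_time_steps.reward, so the remaining TD computation can treat the
  # data as a single (batched) step, independent of n_step_update.
  return time_steps, actions, next_time_steps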
def __init__(
    self,
    time_step_spec: ts.TimeStep,
    action_spec: types.NestedTensorSpec,
    q_network: network.Network,
    optimizer: types.Optimizer,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None,
    epsilon_greedy: types.Float = 0.1,
    n_step_update: int = 1,
    boltzmann_temperature: Optional[types.Int] = None,
    emit_log_probability: bool = False,
    # Params for target network updates
    target_q_network: Optional[network.Network] = None,
    target_update_tau: types.Float = 1.0,
    target_update_period: int = 1,
    # Params for training.
    td_errors_loss_fn: Optional[types.LossFn] = None,
    gamma: types.Float = 1.0,
    reward_scale_factor: types.Float = 1.0,
    gradient_clipping: Optional[types.Float] = None,
    # Params for debugging
    debug_summaries: bool = False,
    summarize_grads_and_vars: bool = False,
    train_step_counter: Optional[tf.Variable] = None,
    name: Optional[Text] = None):
  """Creates a DQN Agent.

  Args:
    time_step_spec: A `TimeStep` spec of the expected time_steps.
    action_spec: A nest of BoundedTensorSpec representing the actions.
    q_network: A `tf_agents.network.Network` to be used by the agent. The
      network will be called with `call(observation, step_type)` and should
      emit logits over the action space.
    optimizer: The optimizer to use for training.
    observation_and_action_constraint_splitter: A function used to process
      observations with action constraints. These constraints can indicate,
      for example, a mask of valid/invalid actions for a given state of the
      environment. The function takes in a full observation and returns a
      tuple consisting of 1) the part of the observation intended as input
      to the network and 2) the constraint. An example
      `observation_and_action_constraint_splitter` could be as simple as:
      ```
      def observation_and_action_constraint_splitter(observation):
        return observation['network_input'], observation['constraint']
      ```
      *Note*: when using `observation_and_action_constraint_splitter`, make
      sure the provided `q_network` is compatible with the network-specific
      half of the output of the
      `observation_and_action_constraint_splitter`. In particular,
      `observation_and_action_constraint_splitter` will be called on the
      observation before passing to the network. If
      `observation_and_action_constraint_splitter` is None, action
      constraints are not applied.
    epsilon_greedy: probability of choosing a random action in the default
      epsilon-greedy collect policy (used only if a wrapper is not provided
      to the collect_policy method).
    n_step_update: The number of steps to consider when computing TD error
      and TD loss. Defaults to single-step updates. Note that this requires
      the user to call train on Trajectory objects with a time dimension of
      `n_step_update + 1`. However, note that we do not yet support
      `n_step_update > 1` in the case of RNNs (i.e., non-empty
      `q_network.state_spec`).
    boltzmann_temperature: Temperature value to use for Boltzmann sampling
      of the actions during data collection. The closer to 0.0, the higher
      the probability of choosing the best action.
    emit_log_probability: Whether policies emit log probabilities or not.
    target_q_network: (Optional.) A `tf_agents.network.Network` to be used
      as the target network during Q learning. Every `target_update_period`
      train steps, the weights from `q_network` are copied (possibly with
      smoothing via `target_update_tau`) to `target_q_network`. If
      `target_q_network` is not provided, it is created by making a copy of
      `q_network`, which initializes a new network with the same structure
      and its own layers and weights.

      Network copying is performed via the `Network.copy` superclass method,
      and may inadvertently lead to the resulting network sharing weights
      with the original. This can happen if, for example, the original
      network accepted a pre-built Keras layer in its `__init__`, or
      accepted a Keras layer that wasn't built, but neglected to create a
      new copy. In these cases, it is up to you to provide a target Network
      having weights that are not shared with the original `q_network`. If
      you provide a `target_q_network` that shares any weights with
      `q_network`, a warning will be logged but no exception is thrown.

      Note: shallow copies of Keras layers may be built via the code:

      ```python
      new_layer = type(layer).from_config(layer.get_config())
      ```
    target_update_tau: Factor for soft update of the target networks.
    target_update_period: Period for soft update of the target networks.
    td_errors_loss_fn: A function for computing the TD errors loss. If None,
      a default value of element_wise_huber_loss is used. This function
      takes as input the target and the estimated Q values and returns the
      loss for each element of the batch.
    gamma: A discount factor for future rewards.
    reward_scale_factor: Multiplicative scale for the reward.
    gradient_clipping: Norm length to clip gradients.
    debug_summaries: A bool to gather debug summaries.
    summarize_grads_and_vars: If True, gradient and network variable
      summaries will be written during training.
    train_step_counter: An optional counter to increment every time the
      train op is run. Defaults to the global_step.
    name: The name of this agent. All variables in this module will fall
      under that name. Defaults to the class name.

  Raises:
    ValueError: If `action_spec` contains more than one action or action
      spec minimum is not equal to 0.
    ValueError: If the q networks do not emit floating point outputs with
      inner shape matching `action_spec`.
    NotImplementedError: If `q_network` has non-empty `state_spec` (i.e.,
      an RNN is provided) and `n_step_update > 1`.
""" tf.Module.__init__(self, name=name) self._check_action_spec(action_spec) if epsilon_greedy is not None and boltzmann_temperature is not None: raise ValueError( 'Configured both epsilon_greedy value {} and temperature {}, ' 'however only one of them can be used for exploration.'.format( epsilon_greedy, boltzmann_temperature)) self._observation_and_action_constraint_splitter = ( observation_and_action_constraint_splitter) self._q_network = q_network net_observation_spec = time_step_spec.observation if observation_and_action_constraint_splitter: net_observation_spec, _ = observation_and_action_constraint_splitter( net_observation_spec) q_network.create_variables(net_observation_spec) if target_q_network: target_q_network.create_variables(net_observation_spec) self._target_q_network = common.maybe_copy_target_network_with_checks( self._q_network, target_q_network, input_spec=net_observation_spec, name='TargetQNetwork') self._check_network_output(self._q_network, 'q_network') self._check_network_output(self._target_q_network, 'target_q_network') self._epsilon_greedy = epsilon_greedy self._n_step_update = n_step_update self._boltzmann_temperature = boltzmann_temperature self._optimizer = optimizer self._td_errors_loss_fn = ( td_errors_loss_fn or common.element_wise_huber_loss) self._gamma = gamma self._reward_scale_factor = reward_scale_factor self._gradient_clipping = gradient_clipping self._update_target = self._get_target_updater( target_update_tau, target_update_period) policy, collect_policy = self._setup_policy(time_step_spec, action_spec, boltzmann_temperature, emit_log_probability) if q_network.state_spec and n_step_update != 1: raise NotImplementedError( 'DqnAgent does not currently support n-step updates with stateful ' 'networks (i.e., RNNs), but n_step_update = {}'.format(n_step_update)) train_sequence_length = ( n_step_update + 1 if not q_network.state_spec else None) super(DqnAgent, self).__init__( time_step_spec, action_spec, policy, collect_policy, train_sequence_length=train_sequence_length, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=train_step_counter, validate_args=False, ) if q_network.state_spec: # AsNStepTransition does not support emitting [B, T, ...] tensors, # which we need for DQN-RNN. self._as_transition = data_converter.AsTransition( self.data_context, squeeze_time_dim=False) else: # This reduces the n-step return and removes the extra time dimension, # allowing the rest of the computations to be independent of the # n-step parameter. self._as_transition = data_converter.AsNStepTransition( self.data_context, gamma=gamma, n=n_step_update)