def zeros_from_spec(nested_spec, batch_size):
    """Create nested zero Tensors or Distributions.

    A zero tensor with shape[0]=`batch_size` is created for each TensorSpec,
    and a distribution with all parameters as zero Tensors is created for
    each DistributionSpec.

    Args:
        nested_spec (nested TensorSpec or DistributionSpec):
        batch_size (int): batch size added as the first dimension to the
            shapes in TensorSpec
    Returns:
        nested Tensor or Distribution
    """

    def _zero_tensor(spec):
        if batch_size is None:
            shape = spec.shape
        else:
            spec_shape = tf.convert_to_tensor(value=spec.shape, dtype=tf.int32)
            shape = tf.concat(([batch_size], spec_shape), axis=0)
        dtype = spec.dtype
        return tf.zeros(shape, dtype)

    param_spec = nest_utils.to_distribution_param_spec(nested_spec)
    params = tf.nest.map_structure(_zero_tensor, param_spec)
    return nest_utils.params_to_distributions(params, nested_spec)
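# Hedged usage sketch for zeros_from_spec: the spec nest and batch size below
# are illustrative, not from the original code. With a nest of plain
# TensorSpecs, the distribution conversion passes through and the result is a
# nest of zero Tensors with `batch_size` prepended to each shape.
import tensorflow as tf

example_spec = {
    'image': tf.TensorSpec(shape=(84, 84, 3), dtype=tf.float32),
    'state': tf.TensorSpec(shape=(7,), dtype=tf.float32),
}
example_zeros = zeros_from_spec(example_spec, batch_size=32)
# example_zeros['image'] is a zero Tensor of shape (32, 84, 84, 3).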
def rollout(self, max_num_steps, time_step, policy_state):
    """Perform a rollout of at most `max_num_steps` environment steps and
    return the final time step and policy state."""
    counter = tf.zeros((), tf.int32)
    batch_size = self._env.batch_size
    maximum_iterations = math.ceil(max_num_steps / batch_size)

    def create_ta(s):
        return tf.TensorArray(
            dtype=s.dtype,
            size=maximum_iterations,
            element_shape=tf.TensorShape([batch_size]).concatenate(s.shape))

    training_info_ta = tf.nest.map_structure(
        create_ta,
        self._training_info_spec._replace(
            rollout_info=nest_utils.to_distribution_param_spec(
                self._training_info_spec.rollout_info)))

    [counter, time_step, policy_state, training_info_ta] = tf.while_loop(
        cond=lambda *_: True,
        body=self._rollout_loop_body,
        loop_vars=[counter, time_step, policy_state, training_info_ta],
        maximum_iterations=maximum_iterations,
        back_prop=False,
        name="rollout_loop")

    training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                          training_info_ta)
    training_info = nest_utils.params_to_distributions(
        training_info, self._training_info_spec)
    self._algorithm.summarize_rollout(training_info)
    self._algorithm.summarize_metrics()
    return time_step, policy_state
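# Self-contained sketch of the pattern rollout() relies on: accumulate one
# [batch_size, ...] slice per step in a TensorArray inside tf.while_loop,
# then stack() into a [num_steps, batch_size, ...] tensor. All names here
# are illustrative, not from the library.
import tensorflow as tf

def _stacked_rollout_sketch(num_steps, batch_size):
    ta = tf.TensorArray(
        dtype=tf.float32,
        size=num_steps,
        element_shape=tf.TensorShape([batch_size]))

    def body(i, ta):
        # Stand-in for one environment step producing a [batch_size] result.
        step_result = tf.fill([batch_size], tf.cast(i, tf.float32))
        return i + 1, ta.write(i, step_result)

    _, ta = tf.while_loop(
        cond=lambda i, _: i < num_steps,
        body=body,
        loop_vars=[tf.zeros((), tf.int32), ta])
    return ta.stack()  # shape [num_steps, batch_size]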
def _iter(self, time_step, policy_state):
    """One training iteration."""
    counter = tf.zeros((), tf.int32)
    batch_size = self._env.batch_size

    def create_ta(s):
        return tf.TensorArray(
            dtype=s.dtype,
            size=self._train_interval,
            element_shape=tf.TensorShape([batch_size]).concatenate(s.shape))

    training_info_ta = tf.nest.map_structure(
        create_ta,
        self._training_info_spec._replace(
            info=nest_utils.to_distribution_param_spec(
                self._training_info_spec.info)))

    with tf.GradientTape(
            watch_accessed_variables=False, persistent=True) as tape:
        tape.watch(self._trainable_variables)
        [counter, next_time_step, next_state, training_info_ta
         ] = tf.while_loop(
             cond=lambda *_: True,
             body=self._train_loop_body,
             loop_vars=[counter, time_step, policy_state, training_info_ta],
             back_prop=True,
             parallel_iterations=1,
             maximum_iterations=self._train_interval,
             name='iter_loop')

    training_info = tf.nest.map_structure(lambda ta: ta.stack(),
                                          training_info_ta)
    training_info = nest_utils.params_to_distributions(
        training_info, self._training_info_spec)
    loss_info, grads_and_vars = self._algorithm.train_complete(
        tape, training_info)
    del tape
    self._algorithm.summarize_train(training_info, loss_info, grads_and_vars)
    self._algorithm.summarize_metrics()
    common.get_global_counter().assign_add(1)
    return [next_time_step, next_state]
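# Minimal sketch of the explicit-watch GradientTape pattern used above:
# watch_accessed_variables=False means only variables passed to tape.watch()
# are traced, and persistent=True lets train_complete() reuse the tape before
# it is deleted. The variable and loss below are illustrative.
import tensorflow as tf

v = tf.Variable(2.0)
with tf.GradientTape(watch_accessed_variables=False, persistent=True) as tape:
    tape.watch([v])       # only explicitly watched variables get gradients
    loss = v * v
grads = tape.gradient(loss, [v])  # [4.0]
del tape  # release resources held by the persistent tape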
def set_exp_replayer(self, exp_replayer: str, num_envs):
    """Set experience replayer.

    Args:
        exp_replayer (str): type of experience replayer. One of
            ("one_time", "uniform")
        num_envs (int): the total number of environments from all batched
            environments.
    """
    if exp_replayer == "one_time":
        self._exp_replayer = OnetimeExperienceReplayer()
    elif exp_replayer == "uniform":
        exp_spec = nest_utils.to_distribution_param_spec(
            self.experience_spec)
        self._exp_replayer = SyncUniformExperienceReplayer(
            exp_spec, num_envs)
    else:
        raise ValueError("invalid experience replayer name")
    self.add_experience_observer(self._exp_replayer.observe)
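# Hypothetical call site (the `algorithm` instance and env count are
# illustrative): route rollout experience into a uniform replay buffer sized
# for 32 environments.
algorithm.set_exp_replayer("uniform", num_envs=32)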
def set_exp_replayer(self,
                     exp_replayer: str,
                     num_envs,
                     num_actors=0,
                     unroll_length=0,
                     learn_queue_cap=0):
    """Set experience replayer.

    Args:
        exp_replayer (str): type of experience replayer. One of
            ("one_time", "uniform", "cycle_one_time")
        num_envs (int): the total number of environments from all batched
            environments/actors, which is num_actors * batch_size.
        num_actors (int): number of async actors; must be positive for the
            "cycle_one_time" replayer.
        unroll_length (int): number of env steps to unroll. Used by the
            "cycle_one_time" replayer.
        learn_queue_cap (int): number of actors to use for each mini-batch.
    """
    if exp_replayer == "one_time":
        self._exp_replayer = OnetimeExperienceReplayer()
    else:
        exp_spec = nest_utils.to_distribution_param_spec(
            self.experience_spec)
        if exp_replayer == "uniform":
            self._exp_replayer = SyncUniformExperienceReplayer(
                exp_spec, num_envs)
        elif exp_replayer == "cycle_one_time":
            assert num_actors > 0, "cycle_one_time requires num_actors > 0"
            assert unroll_length > 0, (
                "cycle_one_time requires unroll_length > 0")
            self._exp_replayer = CyclicOneTimeExperienceReplayer(
                exp_spec, num_envs, num_actors, unroll_length,
                learn_queue_cap)
        else:
            raise ValueError("invalid experience replayer name")
    self.add_experience_observer(self._exp_replayer.observe)
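# Hypothetical call site for the asynchronous variant. The docstring's
# relation num_envs == num_actors * batch_size is made explicit here; the
# `algorithm` instance and all values are illustrative.
num_actors, batch_size = 4, 8
algorithm.set_exp_replayer(
    "cycle_one_time",
    num_envs=num_actors * batch_size,
    num_actors=num_actors,
    unroll_length=8,
    learn_queue_cap=2)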
def _update(self, experience, weight):
    """Compute one gradient update from a [num_steps, batch_size, ...] batch
    of experience."""
    batch_size = tf.shape(experience.step_type)[1]
    counter = tf.zeros((), tf.int32)
    initial_train_state = common.get_initial_policy_state(
        batch_size, self.train_state_spec)
    if self._use_rollout_state:
        first_train_state = tf.nest.map_structure(
            lambda state: state[0, ...], experience.state)
    else:
        first_train_state = initial_train_state
    num_steps = tf.shape(experience.step_type)[0]

    def create_ta(s):
        # TensorArray cannot use a Tensor (batch_size) as element_shape, so
        # use the statically known batch size from the experience instead.
        ta_batch_size = experience.step_type.shape[1]
        return tf.TensorArray(
            dtype=s.dtype,
            size=num_steps,
            element_shape=tf.TensorShape(
                [ta_batch_size]).concatenate(s.shape))

    experience_ta = tf.nest.map_structure(
        create_ta,
        nest_utils.to_distribution_param_spec(
            self.processed_experience_spec))
    experience_ta = tf.nest.map_structure(
        lambda elem, ta: ta.unstack(elem), experience, experience_ta)
    info_ta = tf.nest.map_structure(
        create_ta,
        nest_utils.to_distribution_param_spec(self.train_step_info_spec))

    scope = get_current_scope()

    def _train_loop_body(counter, policy_state, info_ta):
        exp = tf.nest.map_structure(lambda ta: ta.read(counter),
                                    experience_ta)
        exp = nest_utils.params_to_distributions(
            exp, self.processed_experience_spec)
        policy_state = common.reset_state_if_necessary(
            policy_state, initial_train_state,
            tf.equal(exp.step_type, StepType.FIRST))
        with tf.name_scope(scope):
            policy_step = self.train_step(exp, policy_state)
        info_ta = tf.nest.map_structure(
            lambda ta, x: ta.write(counter, x), info_ta,
            nest_utils.distributions_to_params(policy_step.info))
        counter += 1
        return [counter, policy_step.state, info_ta]

    with tf.GradientTape(
            persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(self.trainable_variables)
        [_, _, info_ta] = tf.while_loop(
            cond=lambda counter, *_: tf.less(counter, num_steps),
            body=_train_loop_body,
            loop_vars=[counter, first_train_state, info_ta],
            back_prop=True,
            name="train_loop")

    info = tf.nest.map_structure(lambda ta: ta.stack(), info_ta)
    info = nest_utils.params_to_distributions(
        info, self.train_step_info_spec)
    experience = nest_utils.params_to_distributions(
        experience, self.processed_experience_spec)
    training_info = TrainingInfo(
        action=experience.action,
        reward=experience.reward,
        discount=experience.discount,
        step_type=experience.step_type,
        rollout_info=experience.rollout_info,
        info=info,
        env_id=experience.env_id)
    loss_info, grads_and_vars = self.train_complete(
        tape=tape, training_info=training_info, weight=weight)
    del tape
    return training_info, loss_info, grads_and_vars
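# Self-contained sketch of the unstack/read pattern _update() uses to step
# through time: a [num_steps, batch_size, ...] tensor is unstacked into a
# TensorArray holding one [batch_size, ...] slice per step. Shapes are
# illustrative.
import tensorflow as tf

steps = tf.random.normal([5, 4, 3])  # [num_steps, batch, feature]
ta = tf.TensorArray(
    dtype=tf.float32, size=5, element_shape=tf.TensorShape([4, 3]))
ta = ta.unstack(steps)               # one slice per time step
first_slice = ta.read(0)             # equals steps[0]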
def __init__(self,
             num_envs,
             env_batch_size,
             learn_queue_cap,
             actor_queue_cap,
             time_step_spec,
             policy_step_spec,
             unroll_length,
             num_actor_queues=1):
    """Create five kinds of queues:

    1. one learner queue
       stores batches of training trajectories; all agent threads enqueue
       unrolled trajectories into it
    2. `num_actor_queues` actor queues
       each queue stores batches of observations from some envs to act upon;
       all agent threads enqueue current observations into one of the actor
       queues to get predicted actions
    3. `num_envs` action-returning queues
       each env holds one such queue for receiving the action predicted by
       the actor
    4. one log queue
       the logging thread retrieves trajectory data from this queue
    5. `num_envs` env-unroll queues
       there is a one-to-one mapping from a queue to an env; each queue
       accumulates `unroll_length` time steps before they are used for
       training

    These queues are used for communication between learner & actor threads
    and actor & logging threads. We manage them in a centralized way to
    facilitate closing.

    Args:
        num_envs (int): number of tf_agents batched environments running in
            parallel. Each environment could be a batch of environments!
        env_batch_size (int): number of envs contained by each batched env
        learn_queue_cap (int): the capacity of the learner queue
        actor_queue_cap (int): the capacity of an actor queue
        time_step_spec (tf.nest): see OffPolicyAsyncDriver._prepare_specs();
            used for creating queues
        policy_step_spec (tf.nest): see OffPolicyAsyncDriver._prepare_specs();
            used for creating queues
        unroll_length (int): how many time steps each environment proceeds
            before training
        num_actor_queues (int): number of actor queues running in parallel
    """
    batch_time_step_spec = repeat_shape_n(time_step_spec, env_batch_size)
    batch_policy_step_spec = repeat_shape_n(
        nest_utils.to_distribution_param_spec(policy_step_spec),
        env_batch_size)
    unrolled_time_step_spec = repeat_shape_n(batch_time_step_spec,
                                             unroll_length)
    unrolled_policy_step_spec = repeat_shape_n(batch_policy_step_spec,
                                               unroll_length)
    self._batch_state_spec = batch_policy_step_spec.state

    self.learn_queue = NestFIFOQueue(
        capacity=learn_queue_cap,
        sample_element=LearningBatch(
            time_step=unrolled_time_step_spec,
            state=unrolled_policy_step_spec.state,
            policy_step=unrolled_policy_step_spec,
            next_time_step=unrolled_time_step_spec))

    self.log_queue = NestFIFOQueue(
        capacity=num_envs,
        sample_element=[
            unrolled_time_step_spec, unrolled_policy_step_spec,
            unrolled_time_step_spec,
            tf.ones((), dtype=tf.int32)
        ])

    tf.debugging.assert_greater_equal(
        num_envs,
        num_actor_queues * actor_queue_cap,
        message="not enough environments!")

    self.actor_queues = [
        NestFIFOQueue(
            capacity=actor_queue_cap,
            sample_element=[
                batch_time_step_spec, batch_policy_step_spec.state,
                tf.ones((), dtype=tf.int32)
            ]) for i in range(num_actor_queues)
    ]

    self.action_return_queues = [
        NestFIFOQueue(capacity=1, sample_element=batch_policy_step_spec)
        for i in range(num_envs)
    ]

    self.env_unroll_queues = [
        NestFIFOQueue(
            capacity=unroll_length,
            sample_element=LearningBatch(
                time_step=batch_time_step_spec,
                state=batch_policy_step_spec.state,
                policy_step=batch_policy_step_spec,
                next_time_step=batch_time_step_spec))
        for i in range(num_envs)
    ]
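# Sketch of the inter-thread queueing these specs drive, using the standard
# tf.queue.FIFOQueue API. NestFIFOQueue is assumed to be a nest-aware wrapper
# around something similar; the dtype and shapes below are illustrative.
import tensorflow as tf

q = tf.queue.FIFOQueue(
    capacity=4,
    dtypes=[tf.float32],
    shapes=[tf.TensorShape([8, 3])])  # e.g. [env_batch_size, obs_dim]
q.enqueue([tf.zeros([8, 3])])         # a producer thread enqueues a batch
batch = q.dequeue()                   # a consumer thread blocks until ready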