def signature(cls, environment_spec: specs.EnvironmentSpec,
              extras_spec: types.NestedSpec = ()):
  # This function currently assumes that self._discount is a scalar.
  # If it ever becomes a nested structure and/or a np.ndarray, this method
  # will need to know its structure / shape. This is because the signature
  # discount shape is the environment's discount shape and this adder's
  # discount shape broadcasted together. Also, the reward shape is this
  # signature discount shape broadcasted together with the environment
  # reward shape. As long as self._discount is a scalar, it will not affect
  # either the signature discount shape or the signature reward shape, so we
  # can ignore it.
  rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
      environment_spec.rewards, environment_spec.discounts)
  rewards_spec = tree.map_structure(_broadcast_specs, rewards_spec,
                                    step_discounts_spec)
  step_discounts_spec = tree.map_structure(copy.deepcopy, step_discounts_spec)

  transition_spec = types.Transition(
      environment_spec.observations,
      environment_spec.actions,
      rewards_spec,
      step_discounts_spec,
      environment_spec.observations,  # next_observation
      extras_spec)

  return tree.map_structure_with_path(base.spec_like_to_tensor_spec,
                                      transition_spec)
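# --- Illustrative sketch (not part of the adder above) ---
# The comment block in `signature` reasons about shapes: the signature reward
# shape is the environment reward shape broadcast with the environment discount
# shape. Below is a minimal, hypothetical example of that shape arithmetic,
# using `np.broadcast_shapes` as a rough stand-in for what the
# `_broadcast_specs` helper computes on the spec shapes:
import numpy as np

env_reward_shape = (3,)   # hypothetical vector-valued reward
env_discount_shape = ()   # scalar per-step discount
signature_reward_shape = np.broadcast_shapes(env_reward_shape, env_discount_shape)
assert signature_reward_shape == (3,)  # the reward spec keeps its shape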
def _compute_cumulative_quantities(
    self, history: types.NestedArray
) -> Tuple[types.NestedArray, types.NestedArray]:
  first_step, *next_steps = tree_utils.unstack_sequence_fields(
      history, self._n_step)

  # Give the same tree structure to the n-step return accumulator,
  # n-step discount accumulator, and self.discount, so that they can be
  # iterated in parallel using tree.map_structure.
  (n_step_return, total_discount,
   self_discount) = tree_utils.broadcast_structures(
       first_step['reward'], first_step['discount'], self._discount)

  # Copy total_discount as it is otherwise read-only.
  total_discount = tree.map_structure(np.copy, total_discount)

  # Broadcast n_step_return to have the broadcasted shape of
  # reward * discount.
  n_step_return = tree.map_structure(
      lambda r, d: np.copy(np.broadcast_to(r, np.broadcast(r, d).shape)),
      n_step_return, total_discount)

  # NOTE: total_discount will include one fewer discount than the number of
  # step discounts. This is so that when the learner/update uses an additional
  # discount we don't apply it twice. Inside the following loop we will
  # apply this right before summing up the n_step_return.
  for step in next_steps:
    (step_discount, step_reward,
     total_discount) = tree_utils.broadcast_structures(
         step['discount'], step['reward'], total_discount)

    # Equivalent to: `total_discount *= self._discount`.
    tree.map_structure(operator.imul, total_discount, self_discount)

    # Equivalent to: `n_step_return += step.reward * total_discount`.
    tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                       n_step_return, step_reward, total_discount)

    # Equivalent to: `total_discount *= step.discount`.
    tree.map_structure(operator.imul, total_discount, step_discount)

  return n_step_return, total_discount
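# --- Illustrative sketch (not part of the adder above) ---
# A minimal scalar trace of what the loop in `_compute_cumulative_quantities`
# accumulates. The rewards, per-step environment discounts and the adder's own
# discount `gamma` below are assumed values for illustration only:
import numpy as np

gamma = 0.99                            # stands in for self._discount
rewards = np.array([1.0, 2.0, 3.0])     # r_0, r_1, r_2
discounts = np.array([1.0, 1.0, 0.0])   # d_0, d_1, d_2 (last step terminal)

n_step_return = rewards[0]
total_discount = discounts[0]
for r, d in zip(rewards[1:], discounts[1:]):
  total_discount *= gamma               # `total_discount *= self._discount`
  n_step_return += r * total_discount   # `n_step_return += reward * total_discount`
  total_discount *= d                   # `total_discount *= step.discount`

# Closed form of the same recursion:
#   n_step_return  = sum_i rewards[i] * gamma**i * prod(discounts[:i])
#   total_discount = gamma**(len(rewards) - 1) * prod(discounts)
assert np.isclose(n_step_return, 1.0 + 2.0 * 0.99 + 3.0 * 0.99**2)
assert np.isclose(total_discount, 0.0)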
def signature(
    cls,
    environment_spec: mava_specs.EnvironmentSpec,
    extras_spec: tf.TypeSpec = {},
) -> tf.TypeSpec:
    # This function currently assumes that self._discount is a scalar.
    # If it ever becomes a nested structure and/or a np.ndarray, this method
    # will need to know its structure / shape. This is because the signature
    # discount shape is the environment's discount shape and this adder's
    # discount shape broadcasted together. Also, the reward shape is this
    # signature discount shape broadcasted together with the environment
    # reward shape. As long as self._discount is a scalar, it will not affect
    # either the signature discount shape or the signature reward shape, so we
    # can ignore it.
    agent_specs = environment_spec.get_agent_specs()
    agents = environment_spec.get_agent_ids()
    env_extras_spec = environment_spec.get_extra_specs()
    extras_spec.update(env_extras_spec)

    obs_specs = {}
    act_specs = {}
    reward_specs = {}
    step_discount_specs = {}
    for agent in agents:
        rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
            agent_specs[agent].rewards, agent_specs[agent].discounts
        )
        rewards_spec = tree.map_structure(
            _broadcast_specs, rewards_spec, step_discounts_spec
        )
        step_discounts_spec = tree.map_structure(copy.deepcopy, step_discounts_spec)

        obs_specs[agent] = agent_specs[agent].observations
        act_specs[agent] = agent_specs[agent].actions
        reward_specs[agent] = rewards_spec
        step_discount_specs[agent] = step_discounts_spec

    transition_spec = [
        obs_specs,
        act_specs,
        extras_spec,
        reward_specs,
        step_discount_specs,
        obs_specs,  # next_observation
        extras_spec,
    ]

    return tree.map_structure_with_path(
        base.spec_like_to_tensor_spec, tuple(transition_spec)
    )
def signature(
    cls,
    environment_spec: specs.EnvironmentSpec,
    extras_spec: tf.TypeSpec = {},
) -> tf.TypeSpec:
    """This is a helper method for generating signatures for Reverb tables.

    Signatures are useful for validating data types and shapes, see Reverb's
    documentation for details on how they are used.

    Args:
        environment_spec: A `specs.EnvironmentSpec` whose fields are nested
            structures with leaf nodes that have `.shape` and `.dtype`
            attributes. This should come from the environment that will be
            used to generate the data inserted into the Reverb table.
        extras_spec: A nested structure with leaf nodes that have `.shape` and
            `.dtype` attributes. The structure (and shapes/dtypes) of this must
            be the same as the `extras` passed into `ReverbAdder.add`.

    Returns:
        A `Step` whose leaf nodes are `tf.TensorSpec` objects.
    """
    agent_specs = environment_spec.get_agent_specs()
    agents = environment_spec.get_agent_ids()
    env_extras_spec = environment_spec.get_extra_specs()
    extras_spec.update(env_extras_spec)

    obs_specs = {}
    act_specs = {}
    reward_specs = {}
    step_discount_specs = {}
    for agent in agents:
        rewards_spec, step_discounts_spec = tree_utils.broadcast_structures(
            agent_specs[agent].rewards, agent_specs[agent].discounts
        )
        obs_specs[agent] = agent_specs[agent].observations
        act_specs[agent] = agent_specs[agent].actions
        reward_specs[agent] = rewards_spec
        step_discount_specs[agent] = step_discounts_spec

    spec_step = base.Step(
        observations=obs_specs,
        actions=act_specs,
        rewards=reward_specs,
        discounts=step_discount_specs,
        start_of_episode=specs.Array(shape=(), dtype=bool),
        extras=extras_spec,
    )
    return tree.map_structure_with_path(base.spec_like_to_tensor_spec, spec_step)
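# --- Hedged usage sketch (not part of the adder above) ---
# The signature is typically handed to a Reverb table so that inserted items
# are shape/dtype-checked on write. The adder class name
# (`ParallelNStepTransitionAdder`), the table name and the pre-built
# `environment_spec` below are assumptions for illustration only.
import reverb

table = reverb.Table(
    name="transition_table",
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    max_size=100_000,
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=ParallelNStepTransitionAdder.signature(environment_spec),
)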
def _write(self):
  # NOTE: we do not check that the buffer is of length N here. This means
  # that at the beginning of an episode we will add the initial N-1
  # transitions (of size 1, 2, ...) and at the end of an episode (when
  # called from write_last) we will write the final transitions of size (N,
  # N-1, ...). See the Note in the docstring.

  # Form the n-step transition given the steps.
  observation = self._buffer[0].observation
  action = self._buffer[0].action
  extras = self._buffer[0].extras
  next_observation = self._next_observation

  # Give the same tree structure to the n-step return accumulator,
  # n-step discount accumulator, and self.discount, so that they can be
  # iterated in parallel using tree.map_structure.
  (n_step_return, total_discount,
   self_discount) = tree_utils.broadcast_structures(
       self._buffer[0].reward, self._buffer[0].discount, self._discount)

  # Copy total_discount, so that accumulating into it doesn't affect
  # _buffer[0].discount.
  total_discount = tree.map_structure(np.copy, total_discount)

  # Broadcast n_step_return to have the broadcasted shape of
  # reward * discount. Also copy, to avoid accumulating into
  # _buffer[0].reward.
  n_step_return = tree.map_structure(
      lambda r, d: np.copy(np.broadcast_to(r, np.broadcast(r, d).shape)),
      n_step_return, total_discount)

  # NOTE: total_discount will include one fewer discount than the number of
  # step discounts. This is so that when the learner/update uses an additional
  # discount we don't apply it twice. Inside the following loop we will
  # apply this right before summing up the n_step_return.
  for step in itertools.islice(self._buffer, 1, None):
    (step_discount, step_reward,
     total_discount) = tree_utils.broadcast_structures(
         step.discount, step.reward, total_discount)

    # Equivalent to: `total_discount *= self._discount`.
    tree.map_structure(operator.imul, total_discount, self_discount)

    # Equivalent to: `n_step_return += step.reward * total_discount`.
    tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                       n_step_return, step_reward, total_discount)

    # Equivalent to: `total_discount *= step.discount`.
    tree.map_structure(operator.imul, total_discount, step_discount)

  transition = types.Transition(
      observation=observation,
      action=action,
      reward=n_step_return,
      discount=total_discount,
      next_observation=next_observation,
      extras=extras)

  # Create a list of steps.
  if self._final_step_placeholder is None:
    # utils.final_step_like is expensive (around 0.085ms) to run every time
    # so we cache its output.
    self._final_step_placeholder = utils.final_step_like(
        self._buffer[0], next_observation)
  final_step: base.Step = self._final_step_placeholder._replace(
      observation=next_observation)
  steps = list(self._buffer) + [final_step]

  # Calculate the priority for this transition.
  table_priorities = utils.calculate_priorities(self._priority_fns, steps)

  # Insert the transition into replay along with its priority.
  self._writer.append(transition)
  for table, priority in table_priorities.items():
    self._writer.create_item(table=table, num_timesteps=1, priority=priority)
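# --- Illustrative note (not part of the adder above) ---
# `self._priority_fns` maps each replay table name to a function that scores
# the list of steps passed to `utils.calculate_priorities`. The mapping below
# is a hypothetical example; a constant score of 1.0 for a single table yields
# uniform replay priorities.
priority_fns = {"priority_table": lambda steps: 1.0}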