def _write_last(self):
  # Create a final step.
  final_step = utils.final_step_like(self._buffer[0], self._next_observation)

  # Append the final step.
  self._buffer.append(final_step)
  self._writer.append(final_step)
  self._step += 1

  # NOTE: this always pads to the fixed length.
  if self._pad_end_of_episode:
    zero_step = tree.map_structure(utils.zeros_like, final_step)

    # Determine how much padding to add. This makes sure that we add (zero)
    # data until the next time we would write a sequence.
    if self._step <= self._max_sequence_length:
      padding = self._max_sequence_length - self._step
    else:
      padding = self._period - (self._step - self._max_sequence_length)

    # Pad with zeros to get a full sequence.
    for _ in range(padding):
      self._buffer.append(zero_step)
      self._writer.append(zero_step)
      self._step += 1

  # Write priorities for the sequence.
  self._maybe_add_priorities()
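
# A minimal standalone check of the else-branch above (hypothetical values
# max_sequence_length=5 and period=3; not acme code). It shows that once an
# episode runs longer than max_sequence_length + period, the computed padding
# goes negative, range() receives a negative value, and no padding is added.
# The variants further below address this with modular arithmetic.
max_sequence_length, period = 5, 3

for step in [4, 6, 8, 9, 12, 20]:
  if step <= max_sequence_length:
    padding = max_sequence_length - step
  else:
    padding = period - (step - max_sequence_length)
  print(f'step={step:2d} -> padding={padding}')
# step= 4 -> padding=1
# step= 6 -> padding=2
# step= 8 -> padding=0
# step= 9 -> padding=-1  <- negative: the padding loop body never runs.
# step=12 -> padding=-4
# step=20 -> padding=-12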
def _write_last(self):
  # Create a final step.
  final_step = utils.final_step_like(self._buffer[0], self._next_observation)

  # Append the final step.
  self._buffer.append(final_step)
  self._writer.append(final_step)
  self._step += 1

  if not self._break_end_of_episode:
    # Write priorities for the sequence.
    self._maybe_add_priorities()

    # base.py has a check that on add_first self._next_observation should be
    # None, thus we need to clear it at the end of each episode.
    self._next_observation = None
    return

  # Determine the delta to the next time we would write a sequence.
  first_write = self._step <= self._max_sequence_length
  if first_write:
    delta = self._max_sequence_length - self._step
  else:
    delta = (self._period -
             (self._step - self._max_sequence_length)) % self._period

  # Bump up to the position where we will write a sequence.
  self._step += delta

  if self._pad_end_of_episode:
    zero_step = tree.map_structure(utils.zeros_like, final_step)

    # Pad with zeros to get a full sequence.
    for _ in range(delta):
      self._buffer.append(zero_step)
      self._writer.append(zero_step)
  elif not first_write:
    # Pop items from the buffer to get a truncated sequence.
    # Note: this is consistent with the padding loop above, since adding zero
    # steps pops the left-most elements. Here we just pop without padding.
    for _ in range(delta):
      self._buffer.popleft()

  # Write priorities for the sequence.
  self._maybe_add_priorities()
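
# A standalone sanity check of the delta computation above (hypothetical
# values max_sequence_length=5 and period=3; not acme code). The modulo keeps
# delta within [0, period) and bumps step onto a write position, i.e. one
# where (step + delta - max_sequence_length) is a multiple of period.
max_sequence_length, period = 5, 3

for step in range(max_sequence_length + 1, 50):
  delta = (period - (step - max_sequence_length)) % period
  assert 0 <= delta < period
  assert (step + delta - max_sequence_length) % period == 0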
def _write(self):
  # NOTE: we do not check that the buffer is of length N here. This means
  # that at the beginning of an episode we will add the initial N-1
  # transitions (of size 1, 2, ...) and at the end of an episode (when
  # called from write_last) we will write the final transitions of size
  # (N, N-1, ...). See the Note in the docstring.

  # Form the n-step transition given the steps.
  observation = self._buffer[0].observation
  action = self._buffer[0].action
  extras = self._buffer[0].extras
  next_observation = self._next_observation

  # Initialize the n-step return and the discount accumulators. We make a
  # copy of the first reward/discount so that when we add/multiply in place
  # it won't change the actual reward or discount.
  n_step_return = copy.deepcopy(self._buffer[0].reward)
  total_discount = copy.deepcopy(self._buffer[0].discount)

  # NOTE: total_discount will contain one fewer self._discount factor than
  # it does step.discount factors. This is so that when the learner/update
  # applies an additional discount we don't apply it twice. Inside the
  # following loop we apply it right before summing up the n_step_return.
  for step in itertools.islice(self._buffer, 1, None):
    total_discount *= self._discount
    n_step_return += step.reward * total_discount
    total_discount *= step.discount

  transition = (observation, action, n_step_return, total_discount,
                next_observation, extras)

  # Create a list of steps.
  final_step = utils.final_step_like(self._buffer[0], next_observation)
  steps = list(self._buffer) + [final_step]

  # Calculate the priority for this transition.
  table_priorities = utils.calculate_priorities(self._priority_fns, steps)

  # Insert the transition into replay along with its priority.
  self._writer.append(transition)
  for table, priority in table_priorities.items():
    self._writer.create_item(table=table, num_timesteps=1, priority=priority)
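
# A small numeric sketch of the accumulation loop above (hypothetical
# rewards/discounts, not acme code). It checks the loop against the closed
# form R = r0 + g*d0*r1 + g^2*d0*d1*r2 with agent discount g, which also
# shows why the learner only needs to apply one extra discount.
agent_discount = 0.99
rewards = [1.0, 2.0, 3.0]
discounts = [1.0, 1.0, 0.0]  # Zero discount: the episode terminates here.

n_step_return = rewards[0]
total_discount = discounts[0]
for r, d in zip(rewards[1:], discounts[1:]):
  total_discount *= agent_discount
  n_step_return += r * total_discount
  total_discount *= d

expected = 1.0 + 0.99 * 1.0 * 2.0 + 0.99**2 * 1.0 * 1.0 * 3.0
assert abs(n_step_return - expected) < 1e-12
assert total_discount == 0.0  # The terminal zero discount is accumulated.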
def _write_last(self):
  # Append a final step, zero-filled except for the next observation.
  final_step = utils.final_step_like(self._buffer[0], self._next_observation)
  self._writer.append(final_step)

  # The length of the sequence we will be adding is the size of the buffer
  # plus one due to the final step.
  steps = list(self._buffer) + [final_step]
  num_steps = len(steps)

  # Calculate the priority for this episode.
  table_priorities = utils.calculate_priorities(self._priority_fns, steps)

  # Create a prioritized item for each table.
  for table_name, priority in table_priorities.items():
    self._writer.create_item(table_name, num_steps, priority)
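
# A hypothetical sketch of how per-table priorities drive item creation; the
# priority_fns mapping and the print stand-in for create_item are illustrative
# stubs, not the real acme/reverb API.
steps = ['step0', 'step1', 'step2']  # Stand-ins for the episode's steps.

priority_fns = {
    'uniform_table': lambda steps: 1.0,
    'length_table': lambda steps: float(len(steps)),
}

table_priorities = {table: fn(steps) for table, fn in priority_fns.items()}

for table_name, priority in table_priorities.items():
  print(f'create_item({table_name!r}, num_timesteps={len(steps)}, '
        f'priority={priority})')
# create_item('uniform_table', num_timesteps=3, priority=1.0)
# create_item('length_table', num_timesteps=3, priority=3.0)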
def _write_last(self):
  # Create a final step.
  final_step = utils.final_step_like(self._buffer[0], self._next_observation)

  # Append the final step.
  self._buffer.append(final_step)
  self._writer.append(final_step)
  self._step += 1

  # NOTE: this always pads to the fixed length.
  if self._pad_end_of_episode:
    zero_step = tree.map_structure(utils.zeros_like, final_step)

    # Determine how much padding to add. This makes sure that we add (zero)
    # data until the next time we would write a sequence.
    if self._step <= self._max_sequence_length:
      padding = self._max_sequence_length - self._step
    else:
      # The following is the original line from acme.adders.reverb.sequence.py.
      # However, it does not work as expected: as self._step grows, padding
      # eventually becomes negative, so the for loop below is never entered
      # because range() receives a negative value.
      # padding = self._period - (self._step - self._max_sequence_length)

      # The following lines work as expected and always pad to the fixed
      # length (with zeros).
      padding = 0
      if (self._step - self._max_sequence_length) % self._period != 0:
        padding = self._period - (
            (self._step - self._max_sequence_length) % self._period)

    # Pad with zeros to get a full sequence.
    for _ in range(padding):
      self._buffer.append(zero_step)
      self._writer.append(zero_step)
      self._step += 1

  # Write priorities for the sequence.
  self._maybe_add_priorities()
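
# A quick standalone check (hypothetical max_sequence_length=5 and period=3;
# not acme code) that the corrected branch above is equivalent to the single
# modulo expression used in the other _write_last variant further up, and
# that it never goes negative.
max_sequence_length, period = 5, 3

def fixed_padding(step):
  padding = 0
  if (step - max_sequence_length) % period != 0:
    padding = period - ((step - max_sequence_length) % period)
  return padding

for step in range(max_sequence_length + 1, 50):
  upstream = (period - (step - max_sequence_length)) % period
  assert fixed_padding(step) == upstream
  assert 0 <= fixed_padding(step) < period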
def _write(self):
  # NOTE: we do not check that the buffer is of length N here. This means
  # that at the beginning of an episode we will add the initial N-1
  # transitions (of size 1, 2, ...) and at the end of an episode (when
  # called from write_last) we will write the final transitions of size
  # (N, N-1, ...). See the Note in the docstring.

  # Form the n-step transition given the steps.
  observation = self._buffer[0].observation
  action = self._buffer[0].action
  extras = self._buffer[0].extras
  next_observation = self._next_observation

  # Give the same tree structure to the n-step return accumulator, the
  # n-step discount accumulator, and self._discount, so that they can be
  # iterated in parallel using tree.map_structure.
  (n_step_return, total_discount,
   self_discount) = tree_utils.broadcast_structures(
       self._buffer[0].reward, self._buffer[0].discount, self._discount)

  # Copy total_discount, so that accumulating into it doesn't affect
  # _buffer[0].discount.
  total_discount = tree.map_structure(np.copy, total_discount)

  # Broadcast n_step_return to have the broadcasted shape of
  # reward * discount. Also copy, to avoid accumulating into
  # _buffer[0].reward.
  n_step_return = tree.map_structure(
      lambda r, d: np.copy(np.broadcast_to(r, np.broadcast(r, d).shape)),
      n_step_return, total_discount)

  # NOTE: total_discount will contain one fewer self._discount factor than
  # it does step.discount factors. This is so that when the learner/update
  # applies an additional discount we don't apply it twice. Inside the
  # following loop we apply it right before summing up the n_step_return.
  for step in itertools.islice(self._buffer, 1, None):
    (step_discount, step_reward,
     total_discount) = tree_utils.broadcast_structures(
         step.discount, step.reward, total_discount)

    # Equivalent to: `total_discount *= self._discount`.
    tree.map_structure(operator.imul, total_discount, self_discount)

    # Equivalent to: `n_step_return += step.reward * total_discount`.
    tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                       n_step_return, step_reward, total_discount)

    # Equivalent to: `total_discount *= step.discount`.
    tree.map_structure(operator.imul, total_discount, step_discount)

  transition = types.Transition(
      observation=observation,
      action=action,
      reward=n_step_return,
      discount=total_discount,
      next_observation=next_observation,
      extras=extras)

  # Create a list of steps.
  if self._final_step_placeholder is None:
    # utils.final_step_like is expensive (around 0.085ms) to run every
    # time, so we cache its output.
    self._final_step_placeholder = utils.final_step_like(
        self._buffer[0], next_observation)
  final_step: base.Step = self._final_step_placeholder._replace(
      observation=next_observation)
  steps = list(self._buffer) + [final_step]

  # Calculate the priority for this transition.
  table_priorities = utils.calculate_priorities(self._priority_fns, steps)

  # Insert the transition into replay along with its priority.
  self._writer.append(transition)
  for table, priority in table_priorities.items():
    self._writer.create_item(table=table, num_timesteps=1, priority=priority)
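
# A minimal sketch of the in-place tree pattern above, over a hypothetical
# nested structure (dicts of numpy arrays), using dm-tree and numpy. It
# checks that mapping operator.imul / operator.iadd over the structure
# matches plain scalar arithmetic.
import operator

import numpy as np
import tree

total_discount = {'a': np.array([1.0]), 'b': np.array([0.5])}
self_discount = {'a': np.array([0.99]), 'b': np.array([0.99])}
n_step_return = {'a': np.array([1.0]), 'b': np.array([2.0])}
step_reward = {'a': np.array([3.0]), 'b': np.array([4.0])}

# Equivalent to `total_discount *= self_discount`, leaf by leaf, in place.
tree.map_structure(operator.imul, total_discount, self_discount)
# Equivalent to `n_step_return += step_reward * total_discount`, in place.
tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                   n_step_return, step_reward, total_discount)

assert np.isclose(total_discount['a'][0], 0.99)
assert np.isclose(n_step_return['a'][0], 1.0 + 3.0 * 0.99)
assert np.isclose(n_step_return['b'][0], 2.0 + 4.0 * 0.5 * 0.99)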