Example No. 1
    def _write_last(self):
        # Create a final step.
        final_step = utils.final_step_like(self._buffer[0],
                                           self._next_observation)

        # Append the final step.
        self._buffer.append(final_step)
        self._writer.append(final_step)
        self._step += 1

        # NOTE: this always pads to the fixed length.
        if self._pad_end_of_episode:
            zero_step = tree.map_structure(utils.zeros_like, final_step)
            # Determine how much padding to add. This makes sure that we add (zero)
            # data until the next time we would write a sequence.
            if self._step <= self._max_sequence_length:
                padding = self._max_sequence_length - self._step
            else:
                padding = self._period - (self._step -
                                          self._max_sequence_length)

            # Pad with zeros to get a full sequence.
            for _ in range(padding):
                self._buffer.append(zero_step)
                self._writer.append(zero_step)
                self._step += 1

        # Write priorities for the sequence.
        self._maybe_add_priorities()
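
A quick numeric illustration of the padding arithmetic above, using hypothetical
values for self._max_sequence_length and self._period (this sketch is not part of
the adder):

max_sequence_length = 5
period = 2

def padding_for(step):
    # Mirrors the two branches in _write_last above.
    if step <= max_sequence_length:
        return max_sequence_length - step
    return period - (step - max_sequence_length)

# Episode ends after 3 steps: pad 2 zero steps to reach a full sequence.
assert padding_for(3) == 2
# Episode ends after 6 steps: pad 1 zero step up to the next write point.
assert padding_for(6) == 1
# Later in an episode the second branch goes negative (see Example No. 5),
# so range() receives a negative value and no padding is added.
assert padding_for(10) == -3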
Example No. 2
    def _write_last(self):
        # Create a final step.
        final_step = utils.final_step_like(self._buffer[0],
                                           self._next_observation)

        # Append the final step.
        self._buffer.append(final_step)
        self._writer.append(final_step)
        self._step += 1

        if not self._break_end_of_episode:
            # Write priorities for the sequence.
            self._maybe_add_priorities()

            # base.py checks that self._next_observation is None when add_first is
            # called, so we need to clear it at the end of each episode.
            self._next_observation = None
            return

        # Determine the delta to the next time we would write a sequence.
        first_write = self._step <= self._max_sequence_length
        if first_write:
            delta = self._max_sequence_length - self._step
        else:
            delta = (self._period -
                     (self._step - self._max_sequence_length)) % self._period

        # Bump up to the position where we will write a sequence.
        self._step += delta

        if self._pad_end_of_episode:
            zero_step = tree.map_structure(utils.zeros_like, final_step)

            # Pad with zeros to get a full sequence.
            for _ in range(delta):
                self._buffer.append(zero_step)
                self._writer.append(zero_step)
        elif not first_write:
            # Pop items from the buffer to get a truncated sequence.
            # Note: this is consistent with the padding loop above, since adding zero
            # steps pops the left-most elements. Here we just pop without padding.
            for _ in range(delta):
                self._buffer.popleft()

        # Write priorities for the sequence.
        self._maybe_add_priorities()
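
A minimal numeric sketch of the delta computation in this variant, again with
hypothetical values; the extra "% self._period" keeps delta non-negative even late
in an episode, unlike Example No. 1:

max_sequence_length = 5
period = 2

def delta_for(step):
    # Mirrors the branch logic in this variant of _write_last.
    if step <= max_sequence_length:  # first write of the episode
        return max_sequence_length - step
    return (period - (step - max_sequence_length)) % period

assert delta_for(3) == 2   # pad/advance up to a full first sequence
assert delta_for(6) == 1   # one step short of the next write point
assert delta_for(7) == 0   # exactly at a write point, nothing to add
assert delta_for(10) == 1  # stays non-negative late in the episode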
Example No. 3
    def _write(self):
        # NOTE: we do not check that the buffer is of length N here. This means
        # that at the beginning of an episode we will add the initial N-1
        # transitions (of size 1, 2, ...) and at the end of an episode (when
        # called from write_last) we will write the final transitions of size (N,
        # N-1, ...). See the Note in the docstring.

        # Form the n-step transition given the steps.
        observation = self._buffer[0].observation
        action = self._buffer[0].action
        extras = self._buffer[0].extras
        next_observation = self._next_observation

        # Initialize the n-step return and the discount accumulators. We make a
        # copy of the first reward/discount so that when we add/multiply in place
        # it won't change the actual reward or discount.
        n_step_return = copy.deepcopy(self._buffer[0].reward)
        total_discount = copy.deepcopy(self._buffer[0].discount)

        # NOTE: total_discount will contain one fewer factor of self._discount
        # than of step.discount. This is so that when the learner/update applies
        # an additional discount we don't apply it twice. Inside the following
        # loop we apply self._discount right before summing into n_step_return.
        for step in itertools.islice(self._buffer, 1, None):
            total_discount *= self._discount
            n_step_return += step.reward * total_discount
            total_discount *= step.discount

        transition = (observation, action, n_step_return, total_discount,
                      next_observation, extras)

        # Create a list of steps.
        final_step = utils.final_step_like(self._buffer[0], next_observation)
        steps = list(self._buffer) + [final_step]

        # Calculate the priority for this transition.
        table_priorities = utils.calculate_priorities(self._priority_fns,
                                                      steps)

        # Insert the transition into replay along with its priority.
        self._writer.append(transition)
        for table, priority in table_priorities.items():
            self._writer.create_item(table=table,
                                     num_timesteps=1,
                                     priority=priority)
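
In scalar terms the loop above accumulates
R = r_0 + gamma*d_0*r_1 + gamma^2*d_0*d_1*r_2 + ..., applying the agent discount
gamma (self._discount) before each reward and the environment discount after it.
A self-contained sketch with made-up numbers:

import copy
from collections import namedtuple

Step = namedtuple('Step', ['reward', 'discount'])
buffer = [Step(1.0, 1.0), Step(2.0, 1.0), Step(4.0, 0.0)]  # hypothetical steps
gamma = 0.9  # stands in for self._discount

n_step_return = copy.deepcopy(buffer[0].reward)
total_discount = copy.deepcopy(buffer[0].discount)
for step in buffer[1:]:
    total_discount *= gamma
    n_step_return += step.reward * total_discount
    total_discount *= step.discount

# R = 1 + 0.9*1*2 + 0.81*1*1*4 = 6.04
assert abs(n_step_return - 6.04) < 1e-9
# The trailing discount is gamma^2 * d_0*d_1*d_2 = 0 (terminal step).
assert total_discount == 0.0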
Example No. 4
    def _write_last(self):
        # Append a zero-filled final step.
        final_step = utils.final_step_like(self._buffer[0],
                                           self._next_observation)
        self._writer.append(final_step)

        # The length of the sequence we will be adding is the size of the buffer
        # plus one due to the final step.
        steps = list(self._buffer) + [final_step]
        num_steps = len(steps)

        # Calculate the priority for this episode.
        table_priorities = utils.calculate_priorities(self._priority_fns,
                                                      steps)

        # Create a prioritized item for each table.
        for table_name, priority in table_priorities.items():
            self._writer.create_item(table_name, num_steps, priority)
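
A small stand-in sketch of the bookkeeping above; the buffer, steps, and priority
function here are hypothetical placeholders, not the acme/reverb implementations:

buffer = ['s0', 's1', 's2']              # hypothetical buffered steps
final_step = 'final'
steps = buffer + [final_step]
num_steps = len(steps)
assert num_steps == len(buffer) + 1      # "size of the buffer plus one"

# Stand-in for utils.calculate_priorities: one priority per destination table.
priority_fns = {'episodes': lambda s: 1.0}
table_priorities = {table: fn(steps) for table, fn in priority_fns.items()}
assert table_priorities == {'episodes': 1.0}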
Example No. 5
    def _write_last(self):
        # Create a final step.
        final_step = utils.final_step_like(self._buffer[0],
                                           self._next_observation)

        # Append the final step.
        self._buffer.append(final_step)
        self._writer.append(final_step)
        self._step += 1

        # NOTE: this always pads to the fixed length.
        if self._pad_end_of_episode:
            zero_step = tree.map_structure(utils.zeros_like, final_step)
            # Determine how much padding to add. This makes sure that we add (zero)
            # data until the next time we would write a sequence.
            if self._step <= self._max_sequence_length:
                padding = self._max_sequence_length - self._step
            else:
                # The following is the original line from acme/adders/reverb/sequence.py.
                # However, it does not work as expected: as self._step grows, padding
                # eventually becomes negative, so the for loop below is never entered
                # (range() receives a negative value).

                # padding = self._period - (self._step - self._max_sequence_length)

                # The lines below work as expected and always pad to the fixed
                # length (with zeros).
                padding = 0
                if (self._step -
                        self._max_sequence_length) % self._period != 0:
                    padding = self._period - (
                        (self._step - self._max_sequence_length) %
                        self._period)

            # Pad with zeros to get a full sequence.
            for _ in range(padding):
                self._buffer.append(zero_step)
                self._writer.append(zero_step)
                self._step += 1

        # Write priorities for the sequence.
        self._maybe_add_priorities()
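
To make the comment above concrete, here is a side-by-side comparison of the
original formula and the modulo-based fix, with hypothetical values for the
sequence parameters:

max_sequence_length = 5
period = 2

def original_padding(step):
    return period - (step - max_sequence_length)

def fixed_padding(step):
    remainder = (step - max_sequence_length) % period
    return period - remainder if remainder != 0 else 0

# Just after the first write both formulas agree...
assert original_padding(6) == fixed_padding(6) == 1
# ...but later in the episode the original goes negative, so range() gets a
# negative value and the padding loop never runs, while the fix still pads
# up to the next write point.
assert original_padding(10) == -3
assert fixed_padding(10) == 1
assert fixed_padding(9) == 0  # already at a write point, no padding needed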
Example No. 6
    def _write(self):
        # NOTE: we do not check that the buffer is of length N here. This means
        # that at the beginning of an episode we will add the initial N-1
        # transitions (of size 1, 2, ...) and at the end of an episode (when
        # called from write_last) we will write the final transitions of size (N,
        # N-1, ...). See the Note in the docstring.

        # Form the n-step transition given the steps.
        observation = self._buffer[0].observation
        action = self._buffer[0].action
        extras = self._buffer[0].extras
        next_observation = self._next_observation

        # Give the same tree structure to the n-step return accumulator,
        # n-step discount accumulator, and self.discount, so that they can be
        # iterated in parallel using tree.map_structure.
        (n_step_return, total_discount,
         self_discount) = tree_utils.broadcast_structures(
             self._buffer[0].reward, self._buffer[0].discount, self._discount)

        # Copy total_discount, so that accumulating into it doesn't affect
        # _buffer[0].discount.
        total_discount = tree.map_structure(np.copy, total_discount)

        # Broadcast n_step_return to have the broadcasted shape of
        # reward * discount. Also copy, to avoid accumulating into
        # _buffer[0].reward.
        n_step_return = tree.map_structure(
            lambda r, d: np.copy(np.broadcast_to(r,
                                                 np.broadcast(r, d).shape)),
            n_step_return, total_discount)

        # NOTE: total_discount will contain one fewer factor of self._discount
        # than of step.discount. This is so that when the learner/update applies
        # an additional discount we don't apply it twice. Inside the following
        # loop we apply self._discount right before summing into n_step_return.
        for step in itertools.islice(self._buffer, 1, None):
            (step_discount, step_reward,
             total_discount) = tree_utils.broadcast_structures(
                 step.discount, step.reward, total_discount)

            # Equivalent to: `total_discount *= self._discount`.
            tree.map_structure(operator.imul, total_discount, self_discount)

            # Equivalent to: `n_step_return += step.reward * total_discount`.
            tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                               n_step_return, step_reward, total_discount)

            # Equivalent to: `total_discount *= step.discount`.
            tree.map_structure(operator.imul, total_discount, step_discount)

        transition = types.Transition(observation=observation,
                                      action=action,
                                      reward=n_step_return,
                                      discount=total_discount,
                                      next_observation=next_observation,
                                      extras=extras)

        # Create a list of steps.
        if self._final_step_placeholder is None:
            # utils.final_step_like is expensive (around 0.085ms) to run every time
            # so we cache its output.
            self._final_step_placeholder = utils.final_step_like(
                self._buffer[0], next_observation)
        final_step: base.Step = self._final_step_placeholder._replace(
            observation=next_observation)
        steps = list(self._buffer) + [final_step]

        # Calculate the priority for this transition.
        table_priorities = utils.calculate_priorities(self._priority_fns,
                                                      steps)

        # Insert the transition into replay along with its priority.
        self._writer.append(transition)
        for table, priority in table_priorities.items():
            self._writer.create_item(table=table,
                                     num_timesteps=1,
                                     priority=priority)
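
A minimal sketch of the in-place tree accumulation pattern used in the loop above
(tree.map_structure combined with operator.imul / operator.iadd), assuming the
dm-tree package and a made-up nested reward structure:

import operator

import numpy as np
import tree  # dm-tree

total_discount = {'task': np.array([1.0, 0.5])}  # hypothetical nested discount
self_discount = {'task': np.array(0.9)}          # broadcastable agent discount
n_step_return = {'task': np.array([1.0, 2.0])}   # hypothetical nested return
step_reward = {'task': np.array([3.0, 4.0])}

# Equivalent to `total_discount *= self_discount`, leaf by leaf, in place.
tree.map_structure(operator.imul, total_discount, self_discount)
# Equivalent to `n_step_return += step_reward * total_discount`, in place.
tree.map_structure(lambda nsr, sr, td: operator.iadd(nsr, sr * td),
                   n_step_return, step_reward, total_discount)

np.testing.assert_allclose(total_discount['task'], [0.9, 0.45])
np.testing.assert_allclose(n_step_return['task'], [3.7, 3.8])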