Example No. 1
    def test_make_dataset_with_sequence_length_and_batch_size(self):
        sequence_length = 6
        batch_size = 4
        environment = fakes.ContinuousEnvironment()
        environment_spec = specs.make_environment_spec(environment)
        dataset = reverb_dataset.make_dataset(
            client=self.tf_client,
            environment_spec=environment_spec,
            batch_size=batch_size,
            sequence_length=sequence_length)

        def make_tensor_spec(spec):
            return tf.TensorSpec(shape=(
                batch_size,
                sequence_length,
            ) + spec.shape,
                                 dtype=spec.dtype)

        expected_spec = tree.map_structure(make_tensor_spec, environment_spec)

        expected_spec = adders.Step(observation=expected_spec.observations,
                                    action=expected_spec.actions,
                                    reward=expected_spec.rewards,
                                    discount=expected_spec.discounts,
                                    start_of_episode=specs.Array(
                                        shape=(batch_size, sequence_length),
                                        dtype=bool),
                                    extras=())

        self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
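This test (like Examples No. 2 and No. 5 below) relies on a `_check_specs` helper defined elsewhere in the test module. A minimal sketch of such a helper, assuming it simply compares shapes and dtypes leaf by leaf (an illustration, not the module's actual implementation):

import tree
import tensorflow as tf

def _check_specs(expected_spec, actual_spec) -> bool:
    """Returns True if every leaf of `actual_spec` matches `expected_spec`."""
    expected_leaves = tree.flatten(expected_spec)
    actual_leaves = tree.flatten(actual_spec)
    if len(expected_leaves) != len(actual_leaves):
        return False
    # Compare shape and dtype leaf by leaf over the two nested structures.
    return all(
        tf.TensorShape(e.shape) == tf.TensorShape(a.shape)
        and tf.as_dtype(e.dtype) == tf.as_dtype(a.dtype)
        for e, a in zip(expected_leaves, actual_leaves))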
Example No. 2
    def test_make_dataset_nested_specs(self):
        environment_spec = specs.EnvironmentSpec(observations={
            'obs_1':
            specs.Array((3, 64, 64), 'uint8'),
            'obs_2':
            specs.Array((10, ), 'int32')
        },
                                                 actions=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=-1.,
                                                     maximum=1.),
                                                 rewards=specs.Array(
                                                     (), 'float32'),
                                                 discounts=specs.BoundedArray(
                                                     (),
                                                     'float32',
                                                     minimum=0.,
                                                     maximum=1.))

        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        expected_spec = adders.Step(observation=environment_spec.observations,
                                    action=environment_spec.actions,
                                    reward=environment_spec.rewards,
                                    discount=environment_spec.discounts,
                                    start_of_episode=specs.Array(shape=(),
                                                                 dtype=bool),
                                    extras=())

        self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
Example No. 3
 def _create_dummy_steps(self):
     return reverb_adders.Step(observation=self._DUMMY_OBS,
                               action=self._DUMMY_ACTION,
                               reward=self._DUMMY_REWARD,
                               discount=self._DUMMY_DISCOUNT,
                               start_of_episode=True,
                               extras={'return': self._DUMMY_RETURN})
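The `_DUMMY_*` values are constants defined on the test class itself. Purely hypothetical values consistent with how the step is built above (the real test defines its own constants):

import numpy as np

# Hypothetical dummy constants; shapes and dtypes are illustrative only.
_DUMMY_OBS = np.zeros((4,), dtype=np.float32)
_DUMMY_ACTION = np.zeros((2,), dtype=np.float32)
_DUMMY_REWARD = np.float32(0.0)
_DUMMY_DISCOUNT = np.float32(1.0)
_DUMMY_RETURN = np.float32(0.0)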
Example No. 4
    def _read_results(self):
        while len(self._evaluation_state.received_results
                  ) != self._config.num_directions * 2:
            data = next(self._iterator).data
            data = acme_reverb.Step(*data)

            # validation
            params_key = data.extras['params_key']
            training_step, perturbation_id, is_opposite = params_key
            # If the incoming data does not correspond to the current iteration,
            # we simply ignore it.
            if not np.all(training_step[:-1] ==
                          self._training_state.training_iteration):
                continue

            # The whole episode should be run with the same policy, so let's check
            # for that.
            assert np.all(perturbation_id[:-1] == perturbation_id[0])
            assert np.all(is_opposite[:-1] == is_opposite[0])

            perturbation_id = perturbation_id[0].item()
            is_opposite = is_opposite[0].item()

            total_reward = np.sum(data.reward - self._config.reward_shift)
            k = PerturbationKey(self._training_state.training_iteration,
                                perturbation_id, is_opposite)
            if k in self._evaluation_state.received_results:
                continue
            self._evaluation_state.received_results[k] = EvaluationResult(
                total_reward, data.observation)
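`PerturbationKey` and `EvaluationResult` are small containers defined elsewhere in this agent. A sketch of definitions consistent with their usage above, assuming plain `NamedTuple`s (not necessarily the agent's exact types):

from typing import NamedTuple
import numpy as np

class PerturbationKey(NamedTuple):
    # Identifies one perturbation rollout within a training iteration.
    training_iteration: int
    perturbation_id: int
    is_opposite: bool

class EvaluationResult(NamedTuple):
    # Total (shift-corrected) episode reward plus the observations collected.
    total_reward: float
    observation: np.ndarray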
Example No. 5
    def test_make_dataset_simple(self):
        environment = fakes.ContinuousEnvironment()
        environment_spec = specs.make_environment_spec(environment)
        dataset = reverb_dataset.make_dataset(
            client=self.tf_client, environment_spec=environment_spec)

        expected_spec = adders.Step(observation=environment_spec.observations,
                                    action=environment_spec.actions,
                                    reward=environment_spec.rewards,
                                    discount=environment_spec.discounts,
                                    start_of_episode=specs.Array(shape=(),
                                                                 dtype=bool),
                                    extras=())
        self.assertTrue(_check_specs(expected_spec, dataset.element_spec.data))
Example No. 6
def _build_sequence_example(sequences):
    """Convert raw sequences into a Reverb sequence sample."""
    data = adders.Step(observation=sequences['observation'],
                       action=sequences['action'],
                       reward=sequences['reward'],
                       discount=sequences['discount'],
                       start_of_episode=(),
                       extras=())

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=data)
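A minimal usage sketch, assuming each entry in `sequences` is a length-`T` array keyed exactly as the helper expects (shapes are illustrative only):

import numpy as np

T = 5  # Illustrative sequence length.
sequences = {
    'observation': np.zeros((T, 3), dtype=np.float32),
    'action': np.zeros((T,), dtype=np.float32),
    'reward': np.zeros((T,), dtype=np.float32),
    'discount': np.ones((T,), dtype=np.float32),
}
sample = _build_sequence_example(sequences)
# sample.data is an adders.Step; sample.info is a reverb.SampleInfo.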
Example No. 7
def _spec_to_shapes_and_dtypes(transition_adder: bool,
                               environment_spec: specs.EnvironmentSpec,
                               extra_spec: Optional[types.NestedSpec],
                               sequence_length: Optional[int],
                               convert_zero_size_to_none: bool,
                               using_deprecated_adder: bool):
    """Creates the shapes and dtypes needed to describe the Reverb dataset.

  This takes an `environment_spec`, `extra_spec`, and additional information and
  returns a tuple (shapes, dtypes) that describes the data contained in Reverb.

  Args:
    transition_adder: A boolean, describing if a `TransitionAdder` was used to
      add data.
    environment_spec: A `specs.EnvironmentSpec`, describing the shapes and
      dtypes of the data produced by the environment (and the action).
    extra_spec: A nested structure of objects with a `.shape` and `.dtype`
      property. This describes any additional data the Actor adds into Reverb.
    sequence_length: An optional integer for how long the added sequences are,
      only used with `SequenceAdder`.
    convert_zero_size_to_none: If True, then all shape dimensions that are 0 are
      converted to None. A None dimension is only set at runtime.
    using_deprecated_adder: True if the adder used to generate the data is
      from acme/adders/reverb/deprecated.

  Returns:
    A tuple (shapes, dtypes) that describes the data that has been added to
    Reverb.
  """
    # The *transition* adder is special in that it also adds an arrival state.
    if transition_adder:
        # Use the environment spec but convert it to a plain tuple.
        adder_spec = tuple(environment_spec) + (
            environment_spec.observations, )
        # Any 'extra' data that is passed to the adder is put on the end.
        if extra_spec:
            adder_spec += (extra_spec, )
    elif using_deprecated_adder and deprecated_base is not None:
        adder_spec = deprecated_base.Step(
            observation=environment_spec.observations,
            action=environment_spec.actions,
            reward=environment_spec.rewards,
            discount=environment_spec.discounts,
            extras=() if not extra_spec else extra_spec)
    else:
        adder_spec = adders.Step(observation=environment_spec.observations,
                                 action=environment_spec.actions,
                                 reward=environment_spec.rewards,
                                 discount=environment_spec.discounts,
                                 start_of_episode=specs.Array(shape=(),
                                                              dtype=bool),
                                 extras=() if not extra_spec else extra_spec)

    # Extract the shapes and dtypes from these specs.
    get_dtype = lambda x: tf.as_dtype(x.dtype)
    get_shape = lambda x: tf.TensorShape(x.shape)
    if sequence_length:
        get_shape = lambda x: tf.TensorShape([sequence_length, *x.shape])

    if convert_zero_size_to_none:
        # TODO(b/143692455): Consider making this default behaviour.
        get_shape = lambda x: tf.TensorShape(
            [s if s else None for s in x.shape])
    shapes = tree.map_structure(get_shape, adder_spec)
    dtypes = tree.map_structure(get_dtype, adder_spec)
    return shapes, dtypes
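A minimal usage sketch, assuming the module's `specs`/`tf` imports and a transition adder with no extras (the spec values below are illustrative):

environment_spec = specs.EnvironmentSpec(
    observations=specs.Array((8,), 'float32'),
    actions=specs.BoundedArray((), 'float32', minimum=-1., maximum=1.),
    rewards=specs.Array((), 'float32'),
    discounts=specs.BoundedArray((), 'float32', minimum=0., maximum=1.))

shapes, dtypes = _spec_to_shapes_and_dtypes(
    transition_adder=True,
    environment_spec=environment_spec,
    extra_spec=None,
    sequence_length=None,
    convert_zero_size_to_none=False,
    using_deprecated_adder=False)
# With transition_adder=True both structures are 5-tuples:
# (observation, action, reward, discount, next_observation).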
Example No. 8
File: agent.py  Project: dzorlu/acme
def _sequence_from_episode(observations: acme_types.NestedTensor,
                           actions: tf.Tensor, rewards: tf.Tensor,
                           discounts: tf.Tensor,
                           extra_spec: acme_types.NestedSpec, period: int,
                           sequence_length: int):
    """Produce Reverb-like sequence from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  This function generates fake (all-zero) extras.

  See docs for reverb.SequenceAdder() for more details.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    extra_spec: A possibly nested structure of specs for extras. This function
      will generate fake (all-zero) extras.
    period: The period with which we add sequences.
    sequence_length: The fixed length of sequences we wish to add.

  Returns:
    (o_t, a_t, r_t, d_t, e_t) Tuple.
  """

    length = tf.shape(rewards)[0]
    first = tf.random.uniform(shape=(),
                              minval=0,
                              maxval=length,
                              dtype=tf.int32)
    first = first // period * period  # Get a multiple of `period`.
    to = tf.minimum(first + sequence_length, length)

    def _slice_and_pad(x):
        pad_length = sequence_length + first - to
        padding_shape = tf.concat([[pad_length], tf.shape(x)[1:]], axis=0)
        result = tf.concat(
            [x[first:to], tf.zeros(padding_shape, x.dtype)], axis=0)
        result.set_shape([sequence_length] + x.shape.as_list()[1:])
        return result

    o_t = tree.map_structure(_slice_and_pad, observations)
    a_t = tree.map_structure(_slice_and_pad, actions)
    r_t = _slice_and_pad(rewards)
    d_t = _slice_and_pad(discounts)
    start_of_episode = tf.equal(first, 0)
    start_of_episode = tf.expand_dims(start_of_episode, axis=0)
    start_of_episode = tf.tile(start_of_episode, [sequence_length])

    def _sequence_zeros(spec):
        return tf.zeros([sequence_length] + spec.shape, spec.dtype)

    e_t = tree.map_structure(_sequence_zeros, extra_spec)

    key = tf.zeros([sequence_length], tf.uint64)
    probability = tf.ones([sequence_length], tf.float64)
    table_size = tf.ones([sequence_length], tf.int64)
    priority = tf.ones([sequence_length], tf.float64)
    info = reverb.SampleInfo(key=key,
                             probability=probability,
                             table_size=table_size,
                             priority=priority)
    return reverb.ReplaySample(info=info,
                               data=adders.Step(
                                   observation=o_t,
                                   action=a_t,
                                   reward=r_t,
                                   discount=d_t,
                                   start_of_episode=start_of_episode,
                                   extras=e_t))
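A hypothetical call in eager mode with an all-zero episode of length 10 and no extras (values are illustrative only):

import tensorflow as tf

episode_length = 10
sample = _sequence_from_episode(
    observations=tf.zeros([episode_length, 4]),
    actions=tf.zeros([episode_length, 2]),
    rewards=tf.zeros([episode_length]),
    discounts=tf.ones([episode_length]),
    extra_spec={},  # No extras in this sketch.
    period=2,
    sequence_length=5)
# sample.data.observation has shape [5, 4]; shorter slices are zero-padded.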
Example No. 9
    def test_shapes(self):

        # Fake dimensions for the network and test data.
        batch_size = 2
        sequence_len = 3
        num_actions = 5
        hidden_size = 7

        # Define a trivial recurrent actor-critic network.
        @hk.without_apply_rng
        @hk.transform
        def unroll_fn(observations, state):
            lstm = hk.LSTM(hidden_size)
            embedding, state = hk.dynamic_unroll(lstm, observations, state)
            logits = hk.Linear(num_actions)(embedding)
            values = jnp.squeeze(hk.Linear(1)(embedding), axis=-1)

            return (logits, values), state

        @hk.without_apply_rng
        @hk.transform
        def initial_state_fn():
            return hk.LSTM(hidden_size).initial_state(None)

        # Initial recurrent network state.
        initial_state = initial_state_fn.apply(None)

        # Make some fake data.
        observations = np.ones(shape=(sequence_len, 50))
        actions = np.random.randint(num_actions, size=sequence_len)
        rewards = np.random.rand(sequence_len)
        discounts = np.ones(shape=(sequence_len, ))

        batch_tile = tree_map(
            lambda x: np.tile(x, [batch_size, *([1] * x.ndim)]))
        seq_tile = tree_map(
            lambda x: np.tile(x, [sequence_len, *([1] * x.ndim)]))

        extras = {
            'logits': np.random.rand(sequence_len, num_actions),
            'core_state': seq_tile(initial_state),
        }

        # Package up the data into a ReverbSample.
        data = adders.Step(observations,
                           actions,
                           rewards,
                           discounts,
                           extras=extras,
                           start_of_episode=())
        data = batch_tile(data)
        sample = reverb.ReplaySample(info=None, data=data)

        # Initialise parameters.
        rng = hk.PRNGSequence(1)
        params = unroll_fn.init(next(rng), observations, initial_state)

        # Make loss function.
        loss_fn = impala.impala_loss(unroll_fn, discount=0.99)

        # Return value should be scalar.
        loss = loss_fn(params, sample)
        loss = jax.device_get(loss)
        self.assertEqual(loss.shape, ())
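`tree_map` above is assumed to be a small module-level helper that turns a per-leaf function into one operating on whole nested structures. A sketch of such a helper, assuming it wraps `tree.map_structure` (the test module may define it differently):

import functools
import tree

def tree_map(f):
    """Returns a function that applies `f` to every leaf of a nested structure."""
    return functools.partial(tree.map_structure, f)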