Example #1
def _generate_sqil_samples(
    demonstration_iterator: Iterator[types.Transition],
    replay_iterator: Iterator[reverb.ReplaySample]
) -> Iterator[reverb.ReplaySample]:
    """Generator which creates the sample iterator for SQIL.

  Args:
    demonstration_iterator: Iterator of demonstrations.
    replay_iterator: Replay buffer sample iterator.

  Yields:
    Samples having a mix of demonstrations with reward 1 and replay samples with
    reward 0.
  """
    for demonstrations, replay_sample in zip(demonstration_iterator,
                                             replay_iterator):
        demonstrations = demonstrations._replace(
            reward=np.ones_like(demonstrations.reward))

        replay_transitions = replay_sample.data
        replay_transitions = replay_transitions._replace(
            reward=np.zeros_like(replay_transitions.reward))

        double_batch = tree.map_structure(lambda x, y: np.concatenate([x, y]),
                                          demonstrations, replay_transitions)

        # Split the double batch in an interleaving fashion.
        # e.g. [1, 2, 3, 4, 5, 6] -> [1, 3, 5] and [2, 4, 6]
        yield reverb.ReplaySample(info=replay_sample.info,
                                  data=tree.map_structure(
                                      lambda x: x[0::2], double_batch))
        yield reverb.ReplaySample(info=replay_sample.info,
                                  data=tree.map_structure(
                                      lambda x: x[1::2], double_batch))
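The interleaved [0::2] / [1::2] split at the end of this generator keeps each yielded batch the same size as the incoming batches while mixing demonstration and replay rows. Below is a minimal NumPy sketch of that slicing, using made-up 3-row observation batches (the same values as the unit test in Example #13):

import numpy as np

# Made-up 3-row batches standing in for demonstration and replay observations.
demo_obs = np.array([[1], [2], [3]])
replay_obs = np.array([[4], [5], [6]])

double_batch = np.concatenate([demo_obs, replay_obs])
first_half = double_batch[0::2]   # rows 0, 2, 4 -> [[1], [3], [5]]
second_half = double_batch[1::2]  # rows 1, 3, 5 -> [[2], [4], [6]]
print(first_half.ravel(), second_half.ravel())  # [1 3 5] [2 4 6]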
Example #2
        def normalize_sample(
            observation_statistics: running_statistics.RunningStatisticsState,
            sample: reverb.ReplaySample
        ) -> Tuple[running_statistics.RunningStatisticsState,
                   reverb.ReplaySample]:
            observation = sample.data.observation
            observation_statistics = running_statistics.update(
                observation_statistics, observation)
            observation = running_statistics.normalize(
                observation,
                observation_statistics,
                max_abs_value=max_abs_observation)
            if is_sequence_based:
                assert not hasattr(sample.data, 'next_observation')
                sample = reverb.ReplaySample(
                    sample.info, sample.data._replace(observation=observation))
            else:
                next_observation = running_statistics.normalize(
                    sample.data.next_observation,
                    observation_statistics,
                    max_abs_value=max_abs_observation)
                sample = reverb.ReplaySample(
                    sample.info,
                    sample.data._replace(observation=observation,
                                         next_observation=next_observation))

            return observation_statistics, sample
Example #3
    def test_weighted_generator(self):
        data0 = types.Transition(np.array([[1], [2], [3]]), (), _REWARD, (),
                                 ())
        it0 = iter([data0])

        data1 = types.Transition(np.array([[4], [5], [6]]), (), _REWARD, (),
                                 ())
        data2 = types.Transition(np.array([[7], [8], [9]]), (), _REWARD, (),
                                 ())
        it1 = iter([
            reverb.ReplaySample(info=reverb.SampleInfo(
                *[() for _ in reverb.SampleInfo.tf_dtypes()]),
                                data=data1),
            reverb.ReplaySample(info=reverb.SampleInfo(
                *[() for _ in reverb.SampleInfo.tf_dtypes()]),
                                data=data2)
        ])

        weighted_it = builder._generate_samples_with_demonstrations(
            it0, it1, policy_to_expert_data_ratio=2, batch_size=3)

        np.testing.assert_array_equal(
            next(weighted_it).data.observation, np.array([[1], [4], [5]]))
        np.testing.assert_array_equal(
            next(weighted_it).data.observation, np.array([[7], [8], [2]]))
        self.assertRaises(StopIteration, lambda: next(weighted_it))
Example #4
def transition_dataset(environment: dm_env.Environment) -> tf.data.Dataset:
    """Fake dataset of Reverb N-step transition samples.

  Args:
    environment: Used to create a fake transition by looking at the
      observation, action, discount and reward specs.

  Returns:
    tf.data.Dataset that produces the same fake N-step transition ReverSample
    object indefinitely.
  """

    observation = environment.observation_spec().generate_value()
    action = environment.action_spec().generate_value()
    reward = environment.reward_spec().generate_value()
    discount = environment.discount_spec().generate_value()
    data = (observation, action, reward, discount, observation)

    key = np.array(0, np.uint64)
    probability = np.array(1.0, np.float64)
    table_size = np.array(1, np.int64)
    priority = np.array(1.0, np.float64)
    info = reverb.SampleInfo(key=key,
                             probability=probability,
                             table_size=table_size,
                             priority=priority)
    sample = reverb.ReplaySample(info=info, data=data)

    return tf.data.Dataset.from_tensors(sample).repeat()
Example #5
def _n_step_transition_from_episode(observations: acme_types.NestedTensor,
                                    actions: tf.Tensor,
                                    rewards: tf.Tensor,
                                    discounts: tf.Tensor,
                                    n_step: int,
                                    discount: float):
  """Produce Reverb-like N-step transition from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    n_step: number of steps to squash into a single transition.
    discount: discount to use for TD updates.

  Returns:
    (o_t, a_t, r_t, d_t, o_tp1) tuple.
  """

  max_index = tf.shape(rewards)[0] - 1
  first = tf.random.uniform(shape=(), minval=0, maxval=max_index - 1,
                            dtype=tf.int32)
  last = tf.minimum(first + n_step, max_index)

  o_t = tree.map_structure(operator.itemgetter(first), observations)
  a_t = tree.map_structure(operator.itemgetter(first), actions)
  o_tp1 = tree.map_structure(operator.itemgetter(last), observations)

  # 0, 1, ..., n-1.
  discount_range = tf.cast(tf.range(last - first), tf.float32)
  # 1, g, ..., g^{n-1}.
  additional_discounts = tf.pow(discount, discount_range)
  # 1, d_t, d_t * d_{t+1}, ..., d_t * ... * d_{t+n-2}.
  discounts = tf.concat([[1.], tf.math.cumprod(discounts[first:last-1])], 0)
  # 1, g * d_t, ..., g^{n-1} * d_t * ... * d_{t+n-2}.
  discounts *= additional_discounts
  # r_t + g * d_t * r_{t+1} + ... + g^{n-1} * d_t * ... * d_{t+n-2} * r_{t+n-1}
  # We have to shift rewards by one so last=max_index corresponds to transitions
  # that include the last reward.
  r_t = tf.reduce_sum(rewards[first+1:last+1] * discounts)

  # g^{n-1} * d_{t} * ... * d_{t+n-2}.
  d_t = discounts[-1]

  key = tf.constant(0, tf.uint64)
  probability = tf.constant(1.0, tf.float64)
  table_size = tf.constant(1, tf.int64)
  priority = tf.constant(1.0, tf.float64)
  info = reverb.SampleInfo(
      key=key,
      probability=probability,
      table_size=table_size,
      priority=priority)
  return reverb.ReplaySample(
      info=info, data=acme_types.Transition(o_t, a_t, r_t, d_t, o_tp1))
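The discount bookkeeping above is easier to follow with concrete numbers. The NumPy sketch below uses hypothetical values (n_step = 3, TD discount g = 0.9, and a window not clipped by the episode end) and reproduces the same cumulative products the TensorFlow code computes:

import numpy as np

g = 0.9                                      # TD discount.
rewards = np.array([0.0, 1.0, 2.0, 3.0])     # r_t is ignored; r_{t+1..t+3} are summed.
discounts = np.array([1.0, 0.5, 1.0, 1.0])   # environment discounts d_t, d_{t+1}, ...
first, last = 0, 3                           # a full n_step = 3 window.

additional = g ** np.arange(last - first)                             # [1, g, g^2]
cum = np.concatenate([[1.0], np.cumprod(discounts[first:last - 1])])  # [1, d_t, d_t*d_{t+1}]
weights = additional * cum                                            # [1.0, 0.9, 0.405]

r_t = np.sum(rewards[first + 1:last + 1] * weights)  # 1.0 + 0.9*2.0 + 0.405*3.0 = 4.015
d_t = weights[-1]                                     # 0.405
print(r_t, d_t)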
Example #6
 def test_replay_sample_to_sars_transition_is_sequence(self):
   fake_sample = reverb.ReplaySample(
       info=self._EMPTY_INFO, data=self._create_dummy_steps())
   fake_transition = self._create_dummy_transitions()
   transition_from_sample = reverb_utils.replay_sample_to_sars_transition(
       fake_sample, is_sequence=True)
   tree.map_structure(np.testing.assert_array_equal, transition_from_sample,
                      fake_transition)
Example #7
def _sequence_from_episode(observations: acme_types.NestedTensor,
                           actions: tf.Tensor,
                           rewards: tf.Tensor,
                           discounts: tf.Tensor,
                           extra_spec: acme_types.NestedSpec,
                           period: int,
                           sequence_length: int):
  """Produce Reverb-like sequence from a full episode.

  Observations, actions, rewards and discounts have the same length. This
  function will ignore the first reward and discount and the last action.

  This function generates fake (all-zero) extras.

  See docs for reverb.SequenceAdder() for more details.

  Args:
    observations: [L, ...] Tensor.
    actions: [L, ...] Tensor.
    rewards: [L] Tensor.
    discounts: [L] Tensor.
    extra_spec: A possibly nested structure of specs for extras. This function
      will generate fake (all-zero) extras.
    period: The period with which we add sequences.
    sequence_length: The fixed length of sequences we wish to add.

  Returns:
    (o_t, a_t, r_t, d_t, e_t) Tuple.
  """

  length = tf.shape(rewards)[0]
  first = tf.random.uniform(shape=(), minval=0, maxval=length, dtype=tf.int32)
  first = first // period * period  # Get a multiple of `period`.
  to = tf.minimum(first + sequence_length, length)

  def _slice_and_pad(x):
    pad_length = sequence_length + first - to
    padding_shape = tf.concat([[pad_length], tf.shape(x)[1:]], axis=0)
    result = tf.concat([x[first:to], tf.zeros(padding_shape, x.dtype)], axis=0)
    result.set_shape([sequence_length] + x.shape.as_list()[1:])
    return result

  o_t = tree.map_structure(_slice_and_pad, observations)
  a_t = tree.map_structure(_slice_and_pad, actions)
  r_t = _slice_and_pad(rewards)
  d_t = _slice_and_pad(discounts)

  def _sequence_zeros(spec):
    return tf.zeros([sequence_length] + spec.shape, spec.dtype)

  e_t = tree.map_structure(_sequence_zeros, extra_spec)

  key = tf.zeros([sequence_length], tf.uint64)
  probability = tf.ones([sequence_length], tf.float64)
  table_size = tf.ones([sequence_length], tf.int64)
  info = reverb.SampleInfo(
      key=key, probability=probability, table_size=table_size)
  return reverb.ReplaySample(info=info, data=(o_t, a_t, r_t, d_t, e_t))
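The _slice_and_pad helper pads a truncated slice with zeros so that every sequence has exactly sequence_length steps. A small NumPy sketch with made-up numbers (episode length 5, sequence_length 4, slice starting at index 3):

import numpy as np

x = np.arange(5)                           # a 5-step "episode"
first, sequence_length = 3, 4
to = min(first + sequence_length, len(x))  # 5: the slice is cut short by the episode end.
pad_length = sequence_length + first - to  # 2 zero steps of padding are needed.
padded = np.concatenate([x[first:to], np.zeros(pad_length, x.dtype)])
print(padded)  # [3 4 0 0]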
Example #8
def _generate_aquadem_samples(
    demonstration_iterator,
    replay_iterator, encoder_apply,
    params, ratio,
    min_demo_reward):
  """Generator which creates the sample iterator for Aquadem.

  Args:
    demonstration_iterator: Iterator of demonstrations.
    replay_iterator: Replay buffer sample iterator.
    encoder_apply: Encoder apply function.
    params: Parameters of the encoder.
    ratio: Probability with which we sample the expert demonstrations.
    min_demo_reward: Minimum reward assigned to demonstration transitions.

  Yields:
    A batch of demonstrations or a batch of interactions from the replay buffer.
  """
  if min_demo_reward is None:
    min_demo_reward = -1e10

  for demonstrations, replay_sample in zip(demonstration_iterator,
                                           replay_iterator):

    if np.random.random() < ratio:
      continuous_actions_candidates = encoder_apply(
          params, demonstrations.observation)

      discrete_actions = np.argmin(
          np.linalg.norm(
              continuous_actions_candidates - demonstrations.action[:, :, None],
              axis=1),
          axis=-1)

      demonstrations = demonstrations._replace(
          action=discrete_actions,
          reward=np.maximum(min_demo_reward, demonstrations.reward))
      yield reverb.ReplaySample(info=replay_sample.info, data=demonstrations)

    else:
      yield reverb.ReplaySample(
          info=replay_sample.info, data=replay_sample.data)
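The argmin above snaps each demonstrated continuous action onto the index of the nearest candidate action produced by the encoder. A self-contained NumPy sketch of that step, with hypothetical shapes (batch 2, action dimension 3, 4 candidates per observation):

import numpy as np

candidates = np.random.rand(2, 3, 4)  # [batch, action_dim, num_candidates]
demo_actions = np.random.rand(2, 3)   # [batch, action_dim]

# Euclidean distance from every candidate to the demonstrated action,
# then the index of the closest candidate for each batch element.
distances = np.linalg.norm(candidates - demo_actions[:, :, None], axis=1)
discrete_actions = np.argmin(distances, axis=-1)
print(discrete_actions.shape)  # (2,)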
Example #9
def _build_sequence_example(sequences):
    """Convert raw sequences into a Reverb sequence sample."""
    o = sequences['observation']
    a = sequences['action']
    r = sequences['reward']
    p = sequences['discount']

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=(o, a, r, p))
Example #10
def _build_sequence_example(sequences):
    """Convert raw sequences into a Reverb sequence sample."""
    data = adders.Step(observation=sequences['observation'],
                       action=sequences['action'],
                       reward=sequences['reward'],
                       discount=sequences['discount'],
                       start_of_episode=(),
                       extras=())

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info, data=data)
Example #11
def _add_next_action_extras(
        double_transitions: Transition) -> reverb.ReplaySample:
    # As TD3 is online by default, it expects an iterator over replay samples.
    info = tree.map_structure(lambda dtype: tf.ones([], dtype),
                              reverb.SampleInfo.tf_dtypes())
    return reverb.ReplaySample(
        info=info,
        data=Transition(
            observation=double_transitions.observation[0],
            action=double_transitions.action[0],
            reward=double_transitions.reward[0],
            discount=double_transitions.discount[0],
            next_observation=double_transitions.next_observation[0],
            extras={'next_action': double_transitions.action[1]}))
Example #12
def _build_sarsa_example(sequences):
    """Convert raw sequences into a Reverb n-step SARSA sample."""

    o_tm1 = tree.map_structure(lambda t: t[0], sequences['observation'])
    o_t = tree.map_structure(lambda t: t[1], sequences['observation'])
    a_tm1 = tree.map_structure(lambda t: t[0], sequences['action'])
    a_t = tree.map_structure(lambda t: t[1], sequences['action'])
    r_t = tree.map_structure(lambda t: t[0], sequences['reward'])
    p_t = tree.map_structure(lambda t: t[0], sequences['discount'])

    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    return reverb.ReplaySample(info=info,
                               data=(o_tm1, a_tm1, r_t, p_t, o_t, a_t))
Example #13
 def test_sqil_iterator(self):
     demonstrations = [
         types.Transition(np.array([[1], [2], [3]]), (), (), (), ())
     ]
     replay = [
         reverb.ReplaySample(info=(),
                             data=types.Transition(
                                 np.array([[4], [5], [6]]), (), (), (), ()))
     ]
     sqil_it = builder._generate_sqil_samples(iter(demonstrations),
                                              iter(replay))
     np.testing.assert_array_equal(
         next(sqil_it).data.observation, np.array([[1], [3], [5]]))
     np.testing.assert_array_equal(
         next(sqil_it).data.observation, np.array([[2], [4], [6]]))
     self.assertRaises(StopIteration, lambda: next(sqil_it))
Example #14
def tf_example_to_reverb_sample(example,
                                feature_description,
                                num_timesteps=DEFAULT_NUM_TIMESTEPS):
    """Converts the episode encoded as a tf example into SARSA reverb samples."""
    example = tf.io.parse_single_example(example, feature_description)
    kv = tree_deflatten_with_delimiter(example)
    output = (
        get_slice_of_nested(kv['observation'], 0, num_timesteps - 1),
        get_slice_of_nested(kv['action'], 1, num_timesteps),
        kv['reward'][1:num_timesteps],
        # The two fields below aren't needed for learning,
        # but are kept here to be compatible with acme learner format.
        kv['discount'][1:num_timesteps],
        get_slice_of_nested(kv['observation'], 1, num_timesteps),
        repeat_last_and_append_to_nested(
            get_slice_of_nested(kv['action'], 2, num_timesteps)))
    ret = tf.data.Dataset.from_tensor_slices(output)
    ret = ret.map(lambda *x: reverb.ReplaySample(info=b'None', data=x))  # pytype: disable=wrong-arg-types
    return ret
Example #15
def _generate_samples_with_demonstrations(
        demonstration_iterator: Iterator[types.Transition],
        replay_iterator: Iterator[reverb.ReplaySample],
        policy_to_expert_data_ratio: int,
        batch_size) -> Iterator[reverb.ReplaySample]:
    """Generator which creates the sample having demonstrations in them.

  It takes the demonstrations and replay iterators and generates batches with
  same size as the replay iterator, such that each batches have the ratio of
  policy and expert data specified in policy_to_expert_data_ratio on average.
  There is no constraints on how the demonstrations and replay samples should be
  batched.

  Args:
    demonstration_iterator: Iterator of demonstrations.
    replay_iterator: Replay buffer sample iterator.
    policy_to_expert_data_ratio: Amount of policy transitions for 1 expert
      transitions.
    batch_size: Output batch size, which should match the replay batch size.

  Yields:
    Samples having a mix of demonstrations and policy data. The info will match
    the current replay sample info and the batch size will be the same as the
    replay_iterator data batch size.
  """
    count = 0
    if batch_size % (policy_to_expert_data_ratio + 1) != 0:
        raise ValueError(
            'policy_to_expert_data_ratio + 1 must divide the batch size but '
            f'{batch_size} % {policy_to_expert_data_ratio+1} !=0')
    demo_insertion_size = batch_size // (policy_to_expert_data_ratio + 1)
    policy_insertion_size = batch_size - demo_insertion_size

    demonstration_iterator = _rebatch(demonstration_iterator,
                                      demo_insertion_size)
    for sample, demos in zip(replay_iterator, demonstration_iterator):
        output_transitions = tree.map_structure(
            functools.partial(_mix_arrays,
                              index=policy_insertion_size,
                              seed=count), sample.data, demos)
        count += 1
        yield reverb.ReplaySample(info=sample.info, data=output_transitions)
Example #16
def _make_reverb_sample(o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras):
    """Create Reverb sample with offline data.

  Args:
    o_t: Observation at time t.
    a_t: Action at time t.
    r_t: Reward at time t.
    d_t: Discount at time t.
    o_tp1: Observation at time t+1.
    a_tp1: Action at time t+1.
    extras: Dictionary with extra features.

  Returns:
    Replay sample with fake info: key=0, probability=1, table_size=0.
  """
    info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                             probability=tf.constant(1.0, tf.float64),
                             table_size=tf.constant(0, tf.int64),
                             priority=tf.constant(1.0, tf.float64))
    data = (o_t, a_t, r_t, d_t, o_tp1, a_tp1, extras)
    return reverb.ReplaySample(info=info, data=data)
Example #17
def transition_dataset_from_spec(
        spec: specs.EnvironmentSpec) -> tf.data.Dataset:
    """Constructs fake dataset of Reverb N-step transition samples.

  Args:
    spec: Constructed fake transitions match the provided specification.

  Returns:
    tf.data.Dataset that produces the same fake N-step transition ReverbSample
    object indefinitely.
  """

    observation = _generate_from_spec(spec.observations)
    action = _generate_from_spec(spec.actions)
    reward = _generate_from_spec(spec.rewards)
    discount = _generate_from_spec(spec.discounts)
    data = types.Transition(observation, action, reward, discount, observation)

    info = tree.map_structure(
        lambda tf_dtype: tf.ones([], tf_dtype.as_numpy_dtype),
        reverb.SampleInfo.tf_dtypes())
    sample = reverb.ReplaySample(info=info, data=data)

    return tf.data.Dataset.from_tensors(sample).repeat()
Example #18
def transition_dataset(environment: dm_env.Environment) -> tf.data.Dataset:
    """Fake dataset of Reverb N-step transition samples.

  Args:
    environment: Used to create a fake transition by looking at the observation,
      action, discount and reward specs.

  Returns:
    tf.data.Dataset that produces the same fake N-step transition ReverSample
    object indefinitely.
  """

    observation = environment.observation_spec().generate_value()
    action = environment.action_spec().generate_value()
    reward = environment.reward_spec().generate_value()
    discount = environment.discount_spec().generate_value()
    data = types.Transition(observation, action, reward, discount, observation)

    info = tree.map_structure(
        lambda tf_dtype: tf.ones([], tf_dtype.as_numpy_dtype),
        reverb.SampleInfo.tf_dtypes())
    sample = reverb.ReplaySample(info=info, data=data)

    return tf.data.Dataset.from_tensors(sample).repeat()
Example #19
 def _reverb_sample(*data_tuple):
     info = reverb.SampleInfo(key=tf.constant(0, tf.uint64),
                              probability=tf.constant(1.0, tf.float64),
                              table_size=tf.constant(0, tf.int64),
                              priority=tf.constant(1.0, tf.float64))
     return reverb.ReplaySample(info=info, data=data_tuple)
Example #20
 def add_info_fn(data):
     info = reverb.SampleInfo(key=0,
                              probability=0.0,
                              table_size=0,
                              priority=0.0)
     return reverb.ReplaySample(info=info, data=data)
Example #21
    def test_shapes(self):

        batch_size = 2
        sequence_len = 3
        num_actions = 5
        hidden_size = 7

        # Define a trivial recurrent actor-critic network.
        @hk.without_apply_rng
        @hk.transform
        def unroll_fn(observations, state):
            lstm = hk.LSTM(hidden_size)
            embedding, state = hk.dynamic_unroll(lstm, observations, state)
            logits = hk.Linear(num_actions)(embedding)
            values = jnp.squeeze(hk.Linear(1)(embedding), axis=-1)

            return (logits, values), state

        @hk.without_apply_rng
        @hk.transform
        def initial_state_fn():
            return hk.LSTM(hidden_size).initial_state(None)

        # Initial recurrent network state.
        initial_state = initial_state_fn.apply(None)

        # Make some fake data.
        observations = np.ones(shape=(sequence_len, 50))
        actions = np.random.randint(num_actions, size=sequence_len)
        rewards = np.random.rand(sequence_len)
        discounts = np.ones(shape=(sequence_len, ))

        batch_tile = tree_map(
            lambda x: np.tile(x, [batch_size, *([1] * x.ndim)]))
        seq_tile = tree_map(
            lambda x: np.tile(x, [sequence_len, *([1] * x.ndim)]))
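        # NOTE: `tree_map` is assumed to be a helper defined elsewhere in the
        # test file that curries the given function into a callable mapping it
        # over a nested structure (e.g. via tree.map_structure); it is not
        # part of this snippet.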

        extras = {
            'logits': np.random.rand(sequence_len, num_actions),
            'core_state': seq_tile(initial_state),
        }

        # Package up the data into a ReverbSample.
        data = adders.Step(observations,
                           actions,
                           rewards,
                           discounts,
                           extras=extras,
                           start_of_episode=())
        data = batch_tile(data)
        sample = reverb.ReplaySample(info=None, data=data)

        # Initialise parameters.
        rng = hk.PRNGSequence(1)
        params = unroll_fn.init(next(rng), observations, initial_state)

        # Make loss function.
        loss_fn = impala.impala_loss(unroll_fn, discount=0.99)

        # Return value should be scalar.
        loss = loss_fn(params, sample)
        loss = jax.device_get(loss)
        self.assertEqual(loss.shape, ())
Example #22
def transition_to_replaysample(
        transitions: types.Transition) -> reverb.ReplaySample:
    """Converts a types.Transition to a reverb.ReplaySample."""
    info = tree.map_structure(lambda dtype: tf.ones([], dtype),
                              reverb.SampleInfo.tf_dtypes())
    return reverb.ReplaySample(info=info, data=transitions)
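A minimal usage sketch for the converter above, assuming acme, reverb and TensorFlow are installed and transition_to_replaysample is in scope; the single-row Transition is made up for illustration:

import numpy as np
from acme import types

transition = types.Transition(
    observation=np.zeros((1, 4), dtype=np.float32),
    action=np.zeros((1, 2), dtype=np.float32),
    reward=np.zeros((1,), dtype=np.float32),
    discount=np.ones((1,), dtype=np.float32),
    next_observation=np.zeros((1, 4), dtype=np.float32))

sample = transition_to_replaysample(transition)
# sample.info holds one scalar tf.ones per SampleInfo field (key, probability, ...)
# and sample.data is the original Transition, unchanged.
print(type(sample).__name__)  # ReplaySample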