Example #1
def _create_trajectory(
    observation,
    action,
    policy_info,
    reward,
    discount,
    step_type,
    next_step_type,
    name_scope):
  """Create a Trajectory composed of either Tensors or numpy arrays.

  The input `discount` is used to infer the outer shape of the inputs,
  as it is always expected to be a singleton array with scalar inner shape.

  Args:
    observation: (possibly nested tuple of) `Tensor` or `np.ndarray`;
      all shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    action: (possibly nested tuple of) `Tensor` or `np.ndarray`;
      all shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`;
      all shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    reward: (possibly nested tuple of) `Tensor` or `np.ndarray`;
      all shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    discount: A floating point vector `Tensor` or `np.ndarray`;
      shaped `[B]`, `[T]`, or `[B, T]` (optional).
    step_type: `Tensor` or `np.ndarray` of `ts.StepType`,
      shaped `[B]`, `[T]`, or `[B, T]`.
    next_step_type: `Tensor` or `np.ndarray` of `ts.StepType`,
      shaped `[B]`, `[T]`, or `[B, T]`.
    name_scope: Python string, name to use when creating tensors.

  Returns:
    A `Trajectory` instance.
  """
  if nest_utils.has_tensors(
      observation, action, policy_info, reward, discount):
    with tf.name_scope(name_scope):
      discount = tf.identity(discount)
      shape = tf.shape(input=discount)
      make_tensors = lambda struct: tf.nest.map_structure(tf.identity, struct)
      return Trajectory(
          step_type=tf.fill(shape, step_type),
          observation=make_tensors(observation),
          action=make_tensors(action),
          policy_info=make_tensors(policy_info),
          next_step_type=tf.fill(shape, next_step_type),
          reward=make_tensors(reward),
          discount=discount)
  else:
    discount = np.asarray(discount)
    shape = discount.shape
    make_arrays = lambda struct: tf.nest.map_structure(np.asarray, struct)
    return Trajectory(
        step_type=np.full(shape, step_type),
        observation=make_arrays(observation),
        action=make_arrays(action),
        policy_info=make_arrays(policy_info),
        next_step_type=np.full(shape, next_step_type),
        reward=make_arrays(reward),
        discount=discount)
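
For orientation, here is a minimal, self-contained sketch of what the numpy branch of `_create_trajectory` produces: `discount` fixes the outer shape, the scalar `step_type`/`next_step_type` are broadcast with `np.full`, and the remaining fields pass through `np.asarray`. The `Trajectory` namedtuple below is only a stand-in for this illustration (the real class lives in TF-Agents), and the field values are made up:

```
import collections
import numpy as np

# Stand-in for the TF-Agents Trajectory class, used only for illustration.
Trajectory = collections.namedtuple(
    'Trajectory',
    ['step_type', 'observation', 'action', 'policy_info',
     'next_step_type', 'reward', 'discount'])

discount = np.asarray([1.0, 1.0, 0.0])    # outer shape [T] = [3]
shape = discount.shape
traj = Trajectory(
    step_type=np.full(shape, 1),           # scalar step type broadcast; MID == 1 assumed
    observation=np.asarray([[0.1], [0.2], [0.3]]),
    action=np.asarray([0, 1, 0]),
    policy_info=(),
    next_step_type=np.full(shape, 1),
    reward=np.asarray([0.0, 1.0, 0.0]),
    discount=discount)
print(traj.step_type)                      # [1 1 1]
```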
Example #2
def from_episode(observation, action, policy_info, reward, discount=None):
    """Create a Trajectory from tensors representing a single episode.

    If none of the inputs are tensors, then numpy arrays are generated instead.

    If `discount` is not provided, the first entry in `reward` is used to
    estimate `T`:

    ```
    reward_0 = tf.nest.flatten(reward)[0]
    T = shape(reward_0)[0]
    ```

    In this case, a `discount` of all ones having dtype `float32` is generated.

    Args:
      observation: (possibly nested tuple of) `Tensor` or `np.ndarray`;
        all shaped `[T, ...]`.
      action: (possibly nested tuple of) `Tensor` or `np.ndarray`;
        all shaped `[T, ...]`.
      policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`;
        all shaped `[T, ...]`.
      reward: (possibly nested tuple of) `Tensor` or `np.ndarray`;
        all shaped `[T, ...]`.
      discount: A floating point vector `Tensor` or `np.ndarray`;
        shaped `[T]` (optional).

    Returns:
      An instance of `Trajectory`.
    """
    use_tensors = nest_utils.has_tensors(observation, action, policy_info,
                                         reward, discount)
    if use_tensors:
        ones_fn = tf.ones
        float_dtype = tf.float32
        convert_fn = tf.convert_to_tensor
        concat_fn = tf.concat
        maximum_fn = tf.maximum
        fill_fn = tf.fill
        identity_map = lambda struct: tf.nest.map_structure(
            tf.identity, struct)
    else:
        ones_fn = np.ones
        float_dtype = np.float32
        convert_fn = np.asarray
        concat_fn = np.concatenate
        maximum_fn = np.maximum
        fill_fn = np.full
        identity_map = lambda struct: tf.nest.map_structure(np.asarray, struct)

    def _from_episode(observation, action, policy_info, reward, discount):
        """Implementation of from_episode."""
        if discount is not None:
            time_source = discount
        else:
            time_source = tf.nest.flatten(reward)[0]
        if tf.is_tensor(time_source):
            num_frames = (tf.compat.dimension_value(time_source.shape[0])
                          or tf.shape(input=time_source)[0])
        else:
            num_frames = np.shape(time_source)[0]
        if discount is None:
            discount = ones_fn([num_frames], dtype=float_dtype)

        if not tf.is_tensor(num_frames):

            def check_num_frames(t):
                if t.shape[0] is not None and t.shape[0] != num_frames:
                    raise ValueError('Expected first dimension to be {}, '
                                     'but saw value: {}'.format(num_frames, t))

            tf.nest.map_structure(
                check_num_frames,
                (observation, action, policy_info, reward, discount))

        ts_first = convert_fn(ts.StepType.FIRST)
        ts_last = convert_fn(ts.StepType.LAST)
        mid_size = maximum_fn(0, num_frames - 1)
        ts_mid = fill_fn([mid_size], ts.StepType.MID)
        step_type = concat_fn(([ts_first], ts_mid), axis=0)
        next_step_type = concat_fn((ts_mid, [ts_last]), axis=0)

        return Trajectory(step_type=step_type,
                          observation=identity_map(observation),
                          action=identity_map(action),
                          policy_info=identity_map(policy_info),
                          next_step_type=next_step_type,
                          reward=identity_map(reward),
                          discount=identity_map(discount))

    if use_tensors:
        with tf.name_scope('from_episode'):
            return _from_episode(observation, action, policy_info, reward,
                                 discount)
    else:
        return _from_episode(observation, action, policy_info, reward,
                             discount)
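
The heart of `from_episode` is the step-type construction above: for an episode of `T` frames, `step_type` is `[FIRST, MID, ..., MID]` and `next_step_type` is `[MID, ..., MID, LAST]`. A standalone numpy sketch of that concatenation, assuming the usual `FIRST=0, MID=1, LAST=2` encoding of `ts.StepType`:

```
import numpy as np

FIRST, MID, LAST = 0, 1, 2   # assumed ts.StepType encoding

num_frames = 4               # episode length T
mid = np.full([max(0, num_frames - 1)], MID)
step_type = np.concatenate(([FIRST], mid), axis=0)       # [0 1 1 1]
next_step_type = np.concatenate((mid, [LAST]), axis=0)   # [1 1 1 2]
print(step_type, next_step_type)
```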
Example #3
def from_episode(
    observation: types.NestedSpecTensorOrArray,
    action: types.NestedSpecTensorOrArray,
    policy_info: types.NestedSpecTensorOrArray,
    reward: types.NestedSpecTensorOrArray,
    discount: Optional[types.SpecTensorOrArray] = None) -> Trajectory:
  """Create a Trajectory from tensors representing a single episode.

  If none of the inputs are tensors, then numpy arrays are generated instead.

  If `discount` is not provided, the first entry in `reward` is used to estimate
  `T`:

  ```
  reward_0 = tf.nest.flatten(reward)[0]
  T = shape(reward_0)[0]
  ```

  In this case, a `discount` of all ones having dtype `float32` is generated.

  **NOTE**: all tensors/numpy arrays passed to this function have the same time
  dimension `T`. When the generated trajectory passes through `to_transition`,
  it will only return a `(time_steps, next_time_steps)` pair with `T - 1` in the
  time dimension, which means the reward at step T is dropped. So if the reward
  at step `T` is important, please make sure the episode passed to this function
  contains an additional step.

  Args:
    observation: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    action: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    reward: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    discount: A floating point vector `Tensor` or `np.ndarray`; shaped
      `[T]` (optional).

  Returns:
    An instance of `Trajectory`.
  """
  use_tensors = nest_utils.has_tensors(
      observation, action, policy_info, reward, discount)
  map_structure = functools.partial(
      tf.nest.map_structure, expand_composites=True)
  if use_tensors:
    ones_fn = tf.ones
    float_dtype = tf.float32
    convert_fn = tf.convert_to_tensor
    concat_fn = tf.concat
    maximum_fn = tf.maximum
    fill_fn = tf.fill
    identity_map = lambda struct: map_structure(tf.identity, struct)
  else:
    ones_fn = np.ones
    float_dtype = np.float32
    convert_fn = np.asarray
    concat_fn = np.concatenate
    maximum_fn = np.maximum
    fill_fn = np.full
    identity_map = lambda struct: map_structure(np.asarray, struct)

  def _from_episode(observation, action, policy_info, reward, discount):
    """Implementation of from_episode."""
    if discount is not None:
      time_source = discount
    else:
      time_source = tf.nest.flatten(reward)[0]
    if tf.is_tensor(time_source):
      num_frames = _maybe_static_outer_dim(time_source)
    else:
      num_frames = np.shape(time_source)[0]
    if discount is None:
      discount = ones_fn([num_frames], dtype=float_dtype)

    if not tf.is_tensor(num_frames):

      def check_num_frames(t):
        if tf.is_tensor(t):
          outer_dim = _maybe_static_outer_dim(t)
        else:
          outer_dim = t.shape[0]
        if not tf.is_tensor(outer_dim) and outer_dim != num_frames:
          raise ValueError('Expected first dimension to be {}, '
                           'but saw outer dim: {}'.format(num_frames,
                                                          outer_dim))

      tf.nest.map_structure(
          check_num_frames,
          (observation, action, policy_info, reward, discount),
          expand_composites=False)

    ts_first = convert_fn(ts.StepType.FIRST)
    ts_last = convert_fn(ts.StepType.LAST)
    mid_size = maximum_fn(0, num_frames - 1)
    ts_mid = fill_fn([mid_size], ts.StepType.MID)
    step_type = concat_fn(([ts_first], ts_mid), axis=0)
    next_step_type = concat_fn((ts_mid, [ts_last]), axis=0)

    return Trajectory(
        step_type=step_type,
        observation=identity_map(observation),
        action=identity_map(action),
        policy_info=identity_map(policy_info),
        next_step_type=next_step_type,
        reward=identity_map(reward),
        discount=identity_map(discount))

  if use_tensors:
    with tf.name_scope('from_episode'):
      return _from_episode(observation, action, policy_info, reward, discount)
  else:
    return _from_episode(observation, action, policy_info, reward, discount)
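
Finally, a hedged usage sketch with plain numpy inputs, assuming TF-Agents is installed and `from_episode` is the version shown above exposed from `tf_agents.trajectories.trajectory`. Omitting `discount` lets the function infer `T` from `reward` and fill in a `float32` discount of ones; per the NOTE in the docstring, keep an extra step in the episode if the final reward must survive a later `to_transition` call.

```
import numpy as np
from tf_agents.trajectories import trajectory  # assumed import path

# Episode of T = 3 frames; all inputs are numpy, so the numpy branch is used.
traj = trajectory.from_episode(
    observation=np.array([[0.0], [0.1], [0.2]], dtype=np.float32),
    action=np.array([0, 1, 1]),
    policy_info=(),
    reward=np.array([0.0, 0.0, 1.0], dtype=np.float32))
print(traj.step_type)       # expected: [0 1 1]   (FIRST, MID, MID)
print(traj.next_step_type)  # expected: [1 1 2]   (MID, MID, LAST)
print(traj.discount)        # expected: [1. 1. 1.] (generated ones)
```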