Example #1
def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
  """Checks if time step conforms array spec, even if batched."""
  if batch_size is None:
    return array_spec.check_arrays_nest(time_step, time_step_spec)

  return array_spec.check_arrays_nest(
      time_step, array_spec.add_outer_dims_nest(time_step_spec, (batch_size,)))
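
For orientation, `array_spec.add_outer_dims_nest` prepends the given outer dimensions to every spec in a nest, which is what makes the batched check above work. A minimal sketch of that behavior, assuming `tf_agents` and `numpy` are installed (the shapes and the batch size of 4 are illustrative):

import numpy as np
from tf_agents.specs import array_spec

# A per-element spec; add_outer_dims_nest prepends the batch dimension.
spec = array_spec.ArraySpec(shape=(3,), dtype=np.float32)
batched_spec = array_spec.add_outer_dims_nest(spec, (4,))
print(batched_spec.shape)  # (4, 3)

# A batched observation of shape (4, 3) now passes the nested check.
batch = np.zeros((4, 3), dtype=np.float32)
print(array_spec.check_arrays_nest(batch, batched_spec))  # True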
Example #2
def validate_py_environment(
    environment: py_environment.PyEnvironment,
    episodes: int = 5,
    observation_and_action_constraint_splitter: Optional[
        types.Splitter] = None):
    """Validates the environment follows the defined specs."""
    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    random_policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    if environment.batch_size is not None:
        batched_time_step_spec = array_spec.add_outer_dims_nest(
            time_step_spec, outer_dims=(environment.batch_size, ))
    else:
        batched_time_step_spec = time_step_spec

    episode_count = 0
    time_step = environment.reset()

    while episode_count < episodes:
        if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
            raise ValueError('Given `time_step`: %r does not match expected '
                             '`time_step_spec`: %r' %
                             (time_step, batched_time_step_spec))

        action = random_policy.action(time_step).action
        time_step = environment.step(action)

        episode_count += np.sum(time_step.is_last())
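
A hedged usage sketch for the validator above: `MyPyEnvironment` is a hypothetical `py_environment.PyEnvironment` subclass, and `validate_py_environment` raises ValueError as soon as a produced time step stops matching the environment's own `time_step_spec()`:

env = MyPyEnvironment()  # hypothetical PyEnvironment subclass
validate_py_environment(env, episodes=3)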
Example #3
    def _create_obs_spec_fixed(self):
        spec = {
            GLOBAL_KEY:
                array_spec.ArraySpec.from_array(
                    self._global_context_sampling_fn()),
            PER_ARM_KEY:
                array_spec.add_outer_dims_nest(
                    array_spec.ArraySpec.from_array(
                        self._arm_context_sampling_fn()),
                    (self._max_num_actions, ))
        }
        return spec
Example #4
    def _as_dataset(self,
                    sample_batch_size=None,
                    num_steps=None,
                    sequence_preprocess_fn=None,
                    num_parallel_calls=None):
        if sequence_preprocess_fn is not None:
            raise NotImplementedError(
                'sequence_preprocess_fn is not supported.')
        if num_parallel_calls is not None:
            raise NotImplementedError('PyUniformReplayBuffer does not support '
                                      'num_parallel_calls (must be None).')

        data_spec = self._data_spec
        if sample_batch_size is not None:
            data_spec = array_spec.add_outer_dims_nest(data_spec,
                                                       (sample_batch_size, ))
        if num_steps is not None:
            data_spec = (data_spec, ) * num_steps
        shapes = tuple(s.shape for s in tf.nest.flatten(data_spec))
        dtypes = tuple(s.dtype for s in tf.nest.flatten(data_spec))

        def generator_fn():
            """Generator function."""
            while True:
                if sample_batch_size is not None:
                    batch = [
                        self._get_next(num_steps=num_steps, time_stacked=False)
                        for _ in range(sample_batch_size)
                    ]
                    item = nest_utils.stack_nested_arrays(batch)
                else:
                    item = self._get_next(num_steps=num_steps,
                                          time_stacked=False)
                yield tuple(tf.nest.flatten(item))

        def time_stack(*structures):
            time_axis = 0 if sample_batch_size is None else 1
            return tf.nest.map_structure(
                lambda *elements: tf.stack(elements, axis=time_axis),
                *structures)

        ds = tf.data.Dataset.from_generator(generator_fn, dtypes, shapes).map(
            lambda *items: tf.nest.pack_sequence_as(data_spec, items))
        if num_steps is not None:
            return ds.map(time_stack)
        else:
            return ds
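
The private `_as_dataset` above is what the public `as_dataset` call on the buffer delegates to. A minimal usage sketch, assuming `replay_buffer` is an already-populated `PyUniformReplayBuffer` (the batch size and step count are illustrative):

import tensorflow as tf

# Batches of 32 items, each stacked over 2 consecutive time steps.
dataset = replay_buffer.as_dataset(sample_batch_size=32, num_steps=2)
for experience in dataset.take(1):
    # The items are packed back into the buffer's data_spec structure.
    print(tf.nest.map_structure(lambda t: t.shape, experience))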
Example #5
    def _create_obs_spec_masked(self):
        mask_spec = array_spec.BoundedArraySpec(
            shape=(self._max_num_actions, ),
            dtype=np.int32,
            minimum=0,
            maximum=1)
        spec = ({
            GLOBAL_KEY:
                array_spec.ArraySpec.from_array(
                    self._global_context_sampling_fn()),
            PER_ARM_KEY:
                array_spec.add_outer_dims_nest(
                    array_spec.ArraySpec.from_array(
                        self._arm_context_sampling_fn()),
                    (self._max_num_actions, ))
        }, mask_spec)
        return spec
Example #6
    def _create_obs_spec_featured(self):
        num_actions_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.dtype(type(self._num_actions_fn())),
            minimum=1,
            maximum=self._max_num_actions)
        spec = {
            GLOBAL_KEY:
                array_spec.ArraySpec.from_array(
                    self._global_context_sampling_fn()),
            PER_ARM_KEY:
                array_spec.add_outer_dims_nest(
                    array_spec.ArraySpec.from_array(
                        self._arm_context_sampling_fn()),
                    (self._max_num_actions, )),
            NUM_ACTIONS_KEY:
                num_actions_spec
        }
        return spec
Example #7
    def __init__(self,
                 global_context_sampling_fn,
                 arm_context_sampling_fn,
                 max_num_actions,
                 reward_fn,
                 num_actions_fn=None,
                 batch_size=1,
                 add_num_actions_feature=True):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
    function takes the concatenation of a global and a per-arm feature, and
    outputs a possibly random reward.
    In case `num_actions_fn` is specified, the number of actions will be
    dynamic. The actual number of actions can be encoded in two ways, depending
    on the value of `add_num_actions_feature`:

    If `add_num_actions_feature` is True, an extra feature key `num_actions` is
    added to the observation, whose integer value indicates the number of
    available actions.

    If `add_num_actions_feature` is False, the available actions are encoded by
    an action mask appended to the observation, in the format
    `(observation, [1 1 ... 1 0 ... 0])`.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(global_context_sampling_fn,
                                                    arm_context_sampling_fn,
                                                    5,
                                                    reward_fn,
                                                    num_actions_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape and
        type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or list
        of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) the maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, it should be a function that outputs a single
        integer specifying the number of actions for a given time step. The
        value output by this function will be capped between 1 and
        `max_num_actions`. Depending on the value of `add_num_actions_feature`,
        the number of actions will be encoded by either the feature key
        `num_actions`, or an action mask '[1 1 ... 1 0 0 ... 0]'.
      batch_size: The batch size.
      add_num_actions_feature: (bool) If True (default), the number of actions
        is governed by the feature `num_actions`. If False, action masking is
        used to lower the number of actions for a given sample.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._max_num_actions = max_num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size
        self._num_actions_fn = num_actions_fn
        self._add_num_actions_feature = add_num_actions_feature

        observation_spec = {
            GLOBAL_KEY:
            array_spec.ArraySpec.from_array(global_context_sampling_fn()),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
                (max_num_actions, ))
        }
        if self._num_actions_fn is not None:
            if self._add_num_actions_feature:
                num_actions_spec = array_spec.BoundedArraySpec(
                    shape=(),
                    dtype=np.dtype(type(self._num_actions_fn())),
                    minimum=1,
                    maximum=max_num_actions)
                observation_spec.update({NUM_ACTIONS_KEY: num_actions_spec})
            else:
                mask_spec = array_spec.BoundedArraySpec(
                    shape=(self._max_num_actions, ),
                    dtype=np.int32,
                    minimum=0,
                    maximum=1)
                observation_spec = (observation_spec, mask_spec)

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=max_num_actions - 1,
                                                  name='action')

        super(StationaryStochasticPerArmPyEnvironment,
              self).__init__(observation_spec, action_spec)
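
A brief, hedged sketch of driving the environment built by this constructor, reusing the sampling and reward functions from the docstring example above (the batch size of 2 is illustrative):

import numpy as np

env = StationaryStochasticPerArmPyEnvironment(global_context_sampling_fn,
                                              arm_context_sampling_fn,
                                              5,
                                              reward_fn,
                                              num_actions_fn,
                                              batch_size=2)

time_step = env.reset()
# The observation is a dict keyed by GLOBAL_KEY, PER_ARM_KEY and, since
# num_actions_fn and add_num_actions_feature are set, NUM_ACTIONS_KEY.
print(time_step.observation.keys())

# One arm index per batch entry.
time_step = env.step(np.zeros(2, dtype=np.int32))
print(time_step.reward)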
Example #8
    def as_dataset(self,
                   sample_batch_size=None,
                   prioritized_buffer_beta=0.4,
                   num_steps=None,
                   num_parallel_calls=None):
        """
        build a tf Dataset which will be able to serve batches of experiences at scale

        Params:
            sample_batch_size: size of the batches it will return
            prioritized_buffer_beta: This a factor to which the Importance Sampling is present
            num_steps (int): number of steps to load. Only 1 is supported at the moment
            num_parallel_calls: number of calls to perform in parallel.

        Returns:
            tf dataset
        """
        if num_parallel_calls is not None:
            raise NotImplementedError('PyUniformReplayBuffer does not support num_parallel_calls (must be None).')

        data_spec = self._data_spec

        num_steps_value = num_steps if num_steps is not None else 1

        if num_steps_value != 1:
            raise NotImplementedError(
                'PyPrioritizedReplayBuffer only supports num_steps of 1, but '
                'received num_steps of {}.'.format(num_steps_value))

        if sample_batch_size is not None:
            data_spec = array_spec.add_outer_dims_nest(data_spec, (sample_batch_size,))

        experiences_shapes = tuple(s.shape for s in tf.nest.flatten(data_spec))
        experiences_dtypes = tuple(s.dtype for s in tf.nest.flatten(data_spec))

        indices_shape = (sample_batch_size,) if sample_batch_size is not None else ()
        indices_dtype = np.int32

        weight_shape = (sample_batch_size,) if sample_batch_size is not None else ()
        weight_dtype = np.float32

        shapes = {"experiences": experiences_shapes, "indices": indices_shape, "weights": weight_shape}
        dtypes = {"experiences": experiences_dtypes, "indices": indices_dtype, "weights": weight_dtype}

        def generator_fn():
            while True:
                if sample_batch_size is not None:
                    batch = [self.get_next(num_steps=num_steps_value, time_stacked=False,
                                           prioritized_buffer_beta=prioritized_buffer_beta)
                             for _ in range(sample_batch_size)]
                    item, item_idx, item_weight = nest_utils.stack_nested_arrays(batch)
                else:
                    item, item_idx, item_weight = self.get_next(num_steps=num_steps_value, time_stacked=False,
                                                                prioritized_buffer_beta=prioritized_buffer_beta)

                yield {"experiences": tuple(tf.nest.flatten(item)), "indices": item_idx, "weights": item_weight}

        def pack_items(*items):
            if not items:
                raise ValueError("No items to pack")
            experience = tf.nest.pack_sequence_as(data_spec, items[0]["experiences"])
            return experience, items[0]["indices"], items[0]["weights"]

        ds = tf.data.Dataset.from_generator(generator_fn, dtypes, shapes).map(pack_items)

        return ds
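
A hypothetical usage sketch for this prioritized variant, assuming `prioritized_buffer` is a populated instance of the `PyPrioritizedReplayBuffer` class this method appears to belong to (per its error messages):

dataset = prioritized_buffer.as_dataset(sample_batch_size=16,
                                        prioritized_buffer_beta=0.4)
for experience, indices, weights in dataset.take(1):
    # indices identify the sampled items (e.g. for later priority updates);
    # weights are the importance-sampling corrections to apply in the loss.
    print(indices.shape, weights.shape)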
Example #9
    def __init__(self,
                 global_context_sampling_fn: Callable[[], types.Array],
                 arm_context_sampling_fn: Callable[[], types.Array],
                 num_actions: int,
                 reward_fn: Callable[[types.Array], Sequence[float]],
                 batch_size: Optional[int] = 1,
                 name: Optional[Text] = 'stationary_stochastic_structured'):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn.

    The two feature-generating functions should each output a single
    observation, without the batch_size or the number-of-actions dimensions.

    The reward_fn function takes a global and a per-arm feature, and outputs a
    possibly random reward.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return {'armf1': np.random.randint(-3, 4, [3]),    # A dictionary of
                'armf2': np.random.randint(0, 2, [4, 5])}  # arm features.

      def reward_fn(global_obs, arm_obs):
        return sum(global_obs) + arm_obs['armf1'][0] + arm_obs['armf2'][3, 3]

      env = StationaryStochasticPyEnvironment(global_context_sampling_fn,
                                              arm_context_sampling_fn,
                                              5,
                                              reward_fn,
                                              batch_size=5)

    Args:
      global_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the global context. Its shapes and
        types must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the per-arm context. Its shapes
        must be consistent across calls.
      num_actions: (int) the number of actions in every sample.
      reward_fn: A function that generates a reward when called with a global
        and a per-arm observation.
      batch_size: The batch size.
      name: The name of this environment instance.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._num_actions = num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size

        global_example = global_context_sampling_fn()
        arm_example = arm_context_sampling_fn()
        observation_spec = {
            GLOBAL_KEY:
            tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                  global_example),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                      arm_example), (num_actions, ))
        }

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=num_actions - 1,
                                                  name='action')

        super(StationaryStochasticStructuredPyEnvironment,
              self).__init__(observation_spec, action_spec, name=name)
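
A hedged sketch of the nested observation this structured environment emits, reusing the sampling and reward functions from the docstring example (the batch size of 2 is illustrative):

import tensorflow as tf

env = StationaryStochasticStructuredPyEnvironment(global_context_sampling_fn,
                                                  arm_context_sampling_fn,
                                                  5,
                                                  reward_fn,
                                                  batch_size=2)

time_step = env.reset()
# Per-arm leaves ('armf1', 'armf2') carry an extra num_actions (= 5) dimension
# relative to what arm_context_sampling_fn returns.
print(tf.nest.map_structure(lambda a: a.shape, time_step.observation))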
Example #10
    def __init__(self,
                 global_context_sampling_fn: Callable[[], types.Array],
                 arm_context_sampling_fn: Callable[[], types.Array],
                 max_num_actions: int,
                 reward_fn: Callable[[types.Array], Sequence[float]],
                 num_actions_fn: Optional[Callable[[], int]] = None,
                 batch_size: Optional[int] = 1):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
    function takes the concatenation of a global and a per-arm feature, and
    outputs a possibly random reward.
    In case `num_actions_fn` is specified, the number of actions will be dynamic
    and a `num_actions` feature key indicates the number of actions in any given
    sample.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(global_context_sampling_fn,
                                                    arm_context_sampling_fn,
                                                    5,
                                                    reward_fn,
                                                    num_actions_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape and
        type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or list
        of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) the maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, it should be a function that outputs a single
        integer specifying the number of actions for a given time step. The
        value output by this function will be capped between 1 and
        `max_num_actions`. The number of actions will be encoded in the
        observation by the feature key `num_actions`.
      batch_size: The batch size.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._max_num_actions = max_num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size
        self._num_actions_fn = num_actions_fn

        observation_spec = {
            GLOBAL_KEY:
            array_spec.ArraySpec.from_array(global_context_sampling_fn()),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
                (max_num_actions, ))
        }
        if self._num_actions_fn is not None:
            num_actions_spec = array_spec.BoundedArraySpec(
                shape=(),
                dtype=np.dtype(type(self._num_actions_fn())),
                minimum=1,
                maximum=max_num_actions)
            observation_spec.update({NUM_ACTIONS_KEY: num_actions_spec})

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=max_num_actions - 1,
                                                  name='action')

        super(StationaryStochasticPerArmPyEnvironment,
              self).__init__(observation_spec, action_spec)
Example #11
    def __init__(self,
                 global_context_sampling_fn,
                 arm_context_sampling_fn,
                 num_actions,
                 reward_fn,
                 batch_size=1):
        """Initializes the environment.

    In each round, global context is generated by global_context_sampling_fn,
    per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
    function takes the concatenation of a global and a per-arm feature, and
    outputs a possibly random reward.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      env = StationaryStochasticPyEnvironment(global_context_sampling_fn,
                                              arm_context_sampling_fn,
                                              5,
                                              reward_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape and
        type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or list
        of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      num_actions: (int) the number of actions in every sample.
      reward_fn: A function that generates a reward when called with an
        observation.
      batch_size: The batch size.
    """
        self._global_context_sampling_fn = global_context_sampling_fn
        self._arm_context_sampling_fn = arm_context_sampling_fn
        self._num_actions = num_actions
        self._reward_fn = reward_fn
        self._batch_size = batch_size

        observation_spec = {
            GLOBAL_KEY:
            array_spec.ArraySpec.from_array(global_context_sampling_fn()),
            PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
                (num_actions, ))
        }

        action_spec = array_spec.BoundedArraySpec(shape=(),
                                                  dtype=np.int32,
                                                  minimum=0,
                                                  maximum=num_actions - 1,
                                                  name='action')

        super(StationaryStochasticPerArmPyEnvironment,
              self).__init__(observation_spec, action_spec)