def check_unbatched_time_step_spec(time_step, time_step_spec, batch_size):
    """Checks whether `time_step` conforms to `time_step_spec`, batched or not.

    Args:
      time_step: Nest of arrays to validate.
      time_step_spec: Nest of array specs describing a single time step without
        a batch dimension.
      batch_size: `None` if `time_step` should be compared against
        `time_step_spec` as-is; otherwise the size of the outer dimension that
        is prepended to every spec before the comparison.

    Returns:
      The result of `array_spec.check_arrays_nest` for the (possibly batched)
      spec.
    """
    expected_spec = time_step_spec
    if batch_size is not None:
        # Batched input: every spec gains one leading dimension of `batch_size`.
        expected_spec = array_spec.add_outer_dims_nest(
            time_step_spec, (batch_size,))
    return array_spec.check_arrays_nest(time_step, expected_spec)
def validate_py_environment(
        environment: py_environment.PyEnvironment,
        episodes: int = 5,
        observation_and_action_constraint_splitter: Optional[
            types.Splitter] = None):
    """Validates that the environment produces time steps matching its specs.

    The environment is reset and then driven with a `RandomPyPolicy` until at
    least `episodes` episode ends (`time_step.is_last()`) have been observed.
    Every time step obtained from `reset()` or `step()` is checked against the
    environment's time step spec, with a batch dimension added when
    `environment.batch_size` is not `None`.

    Args:
      environment: The environment to validate.
      episodes: Number of episode ends to observe before stopping. For batched
        environments every `is_last()` entry in the batch counts as one.
      observation_and_action_constraint_splitter: Optional splitter forwarded
        unchanged to `RandomPyPolicy`.

    Raises:
      ValueError: If any time step does not match the expected spec.
    """
    time_step_spec = environment.time_step_spec()
    action_spec = environment.action_spec()

    random_policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter))

    if environment.batch_size is not None:
        batched_time_step_spec = array_spec.add_outer_dims_nest(
            time_step_spec, outer_dims=(environment.batch_size,))
    else:
        batched_time_step_spec = time_step_spec

    def _check_time_step(candidate):
        """Raises ValueError if `candidate` does not match the expected spec."""
        if not array_spec.check_arrays_nest(candidate, batched_time_step_spec):
            raise ValueError('Given `time_step`: %r does not match expected '
                             '`time_step_spec`: %r' %
                             (candidate, batched_time_step_spec))

    episode_count = 0
    time_step = environment.reset()

    while episode_count < episodes:
        _check_time_step(time_step)

        action = random_policy.action(time_step).action
        time_step = environment.step(action)

        # `is_last()` may be an array for batched environments; count them all.
        episode_count += np.sum(time_step.is_last())

    # The loop exits immediately after the step that reached the episode
    # budget, so that final time step has not been validated yet.
    _check_time_step(time_step)
def _create_obs_spec_fixed(self):
    """Builds the observation spec holding only global and per-arm features.

    One sample is drawn from each context sampling function and turned into an
    `ArraySpec`; the per-arm spec then gets an outer dimension of
    `self._max_num_actions`.

    Returns:
      A dict with the keys `GLOBAL_KEY` and `PER_ARM_KEY`.
    """
    global_spec = array_spec.ArraySpec.from_array(
        self._global_context_sampling_fn())
    single_arm_spec = array_spec.ArraySpec.from_array(
        self._arm_context_sampling_fn())
    per_arm_spec = array_spec.add_outer_dims_nest(
        single_arm_spec, (self._max_num_actions,))
    return {GLOBAL_KEY: global_spec, PER_ARM_KEY: per_arm_spec}
def _as_dataset(self, sample_batch_size=None, num_steps=None, sequence_preprocess_fn=None, num_parallel_calls=None): if sequence_preprocess_fn is not None: raise NotImplementedError( 'sequence_preprocess_fn is not supported.') if num_parallel_calls is not None: raise NotImplementedError('PyUniformReplayBuffer does not support ' 'num_parallel_calls (must be None).') data_spec = self._data_spec if sample_batch_size is not None: data_spec = array_spec.add_outer_dims_nest(data_spec, (sample_batch_size, )) if num_steps is not None: data_spec = (data_spec, ) * num_steps shapes = tuple(s.shape for s in tf.nest.flatten(data_spec)) dtypes = tuple(s.dtype for s in tf.nest.flatten(data_spec)) def generator_fn(): """Generator function.""" while True: if sample_batch_size is not None: batch = [ self._get_next(num_steps=num_steps, time_stacked=False) for _ in range(sample_batch_size) ] item = nest_utils.stack_nested_arrays(batch) else: item = self._get_next(num_steps=num_steps, time_stacked=False) yield tuple(tf.nest.flatten(item)) def time_stack(*structures): time_axis = 0 if sample_batch_size is None else 1 return tf.nest.map_structure( lambda *elements: tf.stack(elements, axis=time_axis), *structures) ds = tf.data.Dataset.from_generator(generator_fn, dtypes, shapes).map( lambda *items: tf.nest.pack_sequence_as(data_spec, items)) if num_steps is not None: return ds.map(time_stack) else: return ds
def _create_obs_spec_masked(self):
    """Builds the observation spec paired with an action-mask spec.

    Returns:
      A tuple `(features, mask)`. `features` is a dict with the keys
      `GLOBAL_KEY` and `PER_ARM_KEY`; `mask` is an int32 `BoundedArraySpec` of
      shape `(self._max_num_actions,)` with values between 0 and 1.
    """
    action_mask_spec = array_spec.BoundedArraySpec(
        shape=(self._max_num_actions,), dtype=np.int32, minimum=0, maximum=1)

    global_spec = array_spec.ArraySpec.from_array(
        self._global_context_sampling_fn())
    single_arm_spec = array_spec.ArraySpec.from_array(
        self._arm_context_sampling_fn())
    feature_spec = {
        GLOBAL_KEY: global_spec,
        PER_ARM_KEY: array_spec.add_outer_dims_nest(
            single_arm_spec, (self._max_num_actions,)),
    }
    return feature_spec, action_mask_spec
def _create_obs_spec_featured(self):
    """Builds the observation spec that includes a `NUM_ACTIONS_KEY` feature.

    The dtype of the number-of-actions spec is taken from the Python/NumPy type
    of one value returned by `self._num_actions_fn()`.

    Returns:
      A dict with the keys `GLOBAL_KEY`, `PER_ARM_KEY` and `NUM_ACTIONS_KEY`.
    """
    # Sampled first, matching the order in which the sampling functions are
    # expected to be invoked.
    sampled_num_actions = self._num_actions_fn()
    num_actions_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.dtype(type(sampled_num_actions)),
        minimum=1,
        maximum=self._max_num_actions)

    global_spec = array_spec.ArraySpec.from_array(
        self._global_context_sampling_fn())
    single_arm_spec = array_spec.ArraySpec.from_array(
        self._arm_context_sampling_fn())
    return {
        GLOBAL_KEY: global_spec,
        PER_ARM_KEY: array_spec.add_outer_dims_nest(
            single_arm_spec, (self._max_num_actions,)),
        NUM_ACTIONS_KEY: num_actions_spec,
    }
def __init__(self, global_context_sampling_fn, arm_context_sampling_fn,
             max_num_actions, reward_fn, num_actions_fn=None, batch_size=1,
             add_num_actions_feature=True):
    """Initializes the environment.

    In each round, the global context is generated by
    `global_context_sampling_fn` and the per-arm contexts are generated by
    `arm_context_sampling_fn`. The `reward_fn` function takes the concatenation
    of a global and a per-arm feature, and outputs a possibly random reward.

    When `num_actions_fn` is specified, the number of actions is dynamic. The
    actual number of actions is encoded in one of two ways, depending on
    `add_num_actions_feature`:
      * True: an extra feature key `num_actions` is added to the observation,
        holding the number of available actions as an integer.
      * False: the available actions are encoded by an action mask added to
        the observation in the format `(observation, [1 1 ... 1 0 ... 0])`.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(
          global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
          num_actions_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape
        and type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) The maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, a function that outputs a single integer
        specifying the number of actions for a given time step. The value
        output by this function will be capped between 1 and
        `max_num_actions`. Depending on `add_num_actions_feature`, the number
        of actions is encoded by either the feature key `num_actions` or an
        action mask `[1 1 ... 1 0 0 ... 0]`.
      batch_size: The batch size.
      add_num_actions_feature: (bool) If True (default), the number of actions
        is governed by the feature `num_actions`. If False, action masking is
        used to lower the number of actions for a given sample.
    """
    self._global_context_sampling_fn = global_context_sampling_fn
    self._arm_context_sampling_fn = arm_context_sampling_fn
    self._max_num_actions = max_num_actions
    self._reward_fn = reward_fn
    self._num_actions_fn = num_actions_fn
    self._batch_size = batch_size
    self._add_num_actions_feature = add_num_actions_feature

    # One sample from each generator determines the feature specs.
    global_spec = array_spec.ArraySpec.from_array(global_context_sampling_fn())
    single_arm_spec = array_spec.ArraySpec.from_array(arm_context_sampling_fn())
    observation_spec = {
        GLOBAL_KEY: global_spec,
        PER_ARM_KEY: array_spec.add_outer_dims_nest(
            single_arm_spec, (max_num_actions,)),
    }

    has_dynamic_actions = self._num_actions_fn is not None
    if has_dynamic_actions and self._add_num_actions_feature:
        # The feature's dtype follows the type of a sampled action count.
        sampled_num_actions = self._num_actions_fn()
        observation_spec[NUM_ACTIONS_KEY] = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.dtype(type(sampled_num_actions)),
            minimum=1,
            maximum=max_num_actions)
    elif has_dynamic_actions:
        action_mask_spec = array_spec.BoundedArraySpec(
            shape=(self._max_num_actions,),
            dtype=np.int32,
            minimum=0,
            maximum=1)
        observation_spec = (observation_spec, action_mask_spec)

    action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=max_num_actions - 1,
        name='action')

    super(StationaryStochasticPerArmPyEnvironment, self).__init__(
        observation_spec, action_spec)
def as_dataset(self, sample_batch_size=None, prioritized_buffer_beta=0.4,
               num_steps=None, num_parallel_calls=None):
    """Builds a `tf.data.Dataset` that serves prioritized experience samples.

    Args:
      sample_batch_size: Size of the batches the dataset returns. If `None`,
        elements are single, unbatched samples.
      prioritized_buffer_beta: Importance-sampling factor forwarded unchanged
        to `self.get_next`.
      num_steps: (int) Number of steps to load. Only `None` or 1 is supported
        at the moment.
      num_parallel_calls: Must be `None`; parallel calls are not supported.

    Returns:
      A dataset whose elements are `(experience, indices, weights)` tuples,
      where `experience` follows the structure of the buffer's data spec.

    Raises:
      NotImplementedError: If `num_parallel_calls` is not `None`, or if
        `num_steps` is neither `None` nor 1.
    """
    # Validate the arguments before reading any buffer state so that
    # unsupported calls fail fast and with the intended error.
    if num_parallel_calls is not None:
        raise NotImplementedError(
            'PyPrioritizedReplayBuffer does not support num_parallel_calls '
            '(must be None).')

    num_steps_value = 1 if num_steps is None else num_steps
    if num_steps_value != 1:
        raise NotImplementedError(
            'PyPrioritizedReplayBuffer only supports batches with a '
            'num_steps size of 1, but received batch with num_steps '
            'size {}.'.format(num_steps_value))

    # Use the same `is not None` test everywhere so the experience shapes, the
    # index/weight shapes and the generator always agree on whether the
    # output is batched.
    batched = sample_batch_size is not None

    data_spec = self._data_spec
    if batched:
        data_spec = array_spec.add_outer_dims_nest(
            data_spec, (sample_batch_size,))

    flat_specs = tf.nest.flatten(data_spec)
    per_sample_shape = (sample_batch_size,) if batched else ()

    shapes = {
        "experiences": tuple(spec.shape for spec in flat_specs),
        "indices": per_sample_shape,
        "weights": per_sample_shape,
    }
    dtypes = {
        "experiences": tuple(spec.dtype for spec in flat_specs),
        "indices": np.int32,
        "weights": np.float32,
    }

    def sample_one():
        """Draws one `(item, index, weight)` sample from the buffer."""
        return self.get_next(num_steps=num_steps_value,
                             time_stacked=False,
                             prioritized_buffer_beta=prioritized_buffer_beta)

    def generator_fn():
        """Yields flattened samples together with their indices and weights."""
        while True:
            if batched:
                batch = [sample_one() for _ in range(sample_batch_size)]
                item, item_idx, item_weight = (
                    nest_utils.stack_nested_arrays(batch))
            else:
                item, item_idx, item_weight = sample_one()
            yield {"experiences": tuple(tf.nest.flatten(item)),
                   "indices": item_idx,
                   "weights": item_weight}

    def pack_items(*items):
        """Restores the nested experience structure of one dataset element."""
        if not items:
            raise Exception("No items to pack")
        element = items[0]
        experience = tf.nest.pack_sequence_as(
            data_spec, element["experiences"])
        return experience, element["indices"], element["weights"]

    return tf.data.Dataset.from_generator(
        generator_fn, dtypes, shapes).map(pack_items)
def __init__(self,
             global_context_sampling_fn: Callable[[], types.Array],
             arm_context_sampling_fn: Callable[[], types.Array],
             num_actions: int,
             reward_fn: Callable[[types.Array], Sequence[float]],
             batch_size: Optional[int] = 1,
             name: Optional[Text] = 'stationary_stochastic_structured'):
    """Initializes the environment.

    In each round, the global context is generated by
    `global_context_sampling_fn` and the per-arm contexts are generated by
    `arm_context_sampling_fn`. Both feature generating functions should output
    a single observation, not including either the batch size or the number of
    actions.

    The `reward_fn` function takes a global and a per-arm feature, and outputs
    a possibly random reward.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return {'armf1': np.random.randint(-3, 4, [3]),    # A dictionary of
                'armf2': np.random.randint(0, 2, [4, 5])}  # arm features.

      def reward_fn(global_obs, arm_obs):
        return sum(global_obs) + arm_obs['armf1'][0] + arm_obs['armf2'][3, 3]

      env = StationaryStochasticStructuredPyEnvironment(
          global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
          batch_size=5)

    Args:
      global_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the global context. Its shapes
        and types must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a possibly nested
        structure of features. This output is the per-arm context. Its shapes
        must be consistent across calls.
      num_actions: (int) The number of actions in every sample.
      reward_fn: A function that generates a reward when called with a global
        and a per-arm observation.
      batch_size: The batch size.
      name: The name of this environment instance.
    """
    self._global_context_sampling_fn = global_context_sampling_fn
    self._arm_context_sampling_fn = arm_context_sampling_fn
    self._num_actions = num_actions
    self._reward_fn = reward_fn
    self._batch_size = batch_size

    def to_spec_nest(example):
        """Maps every array in a nest of examples to its `ArraySpec`."""
        return tf.nest.map_structure(array_spec.ArraySpec.from_array, example)

    # One sample from each generator determines the feature specs.
    global_example = global_context_sampling_fn()
    arm_example = arm_context_sampling_fn()

    global_spec = to_spec_nest(global_example)
    per_arm_spec = array_spec.add_outer_dims_nest(
        to_spec_nest(arm_example), (num_actions,))
    observation_spec = {GLOBAL_KEY: global_spec, PER_ARM_KEY: per_arm_spec}

    action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=num_actions - 1,
        name='action')

    super(StationaryStochasticStructuredPyEnvironment, self).__init__(
        observation_spec, action_spec, name=name)
def __init__(self,
             global_context_sampling_fn: Callable[[], types.Array],
             arm_context_sampling_fn: Callable[[], types.Array],
             max_num_actions: int,
             reward_fn: Callable[[types.Array], Sequence[float]],
             num_actions_fn: Optional[Callable[[], int]] = None,
             batch_size: Optional[int] = 1):
    """Initializes the environment.

    In each round, the global context is generated by
    `global_context_sampling_fn` and the per-arm contexts are generated by
    `arm_context_sampling_fn`. The `reward_fn` function takes the concatenation
    of a global and a per-arm feature, and outputs a possibly random reward.
    When `num_actions_fn` is specified, the number of actions is dynamic and a
    `num_actions` feature key indicates the number of actions in any given
    sample.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      def num_actions_fn():
        return np.random.randint(2, 6)

      env = StationaryStochasticPerArmPyEnvironment(
          global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
          num_actions_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape
        and type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      max_num_actions: (int) The maximum number of actions in every sample. If
        `num_actions_fn` is not set, this many actions are available in every
        time step.
      reward_fn: A function that generates a reward when called with an
        observation.
      num_actions_fn: If set, a function that outputs a single integer
        specifying the number of actions for a given time step. The value
        output by this function will be capped between 1 and
        `max_num_actions`. The number of actions is encoded in the observation
        by the feature key `num_actions`.
      batch_size: The batch size.
    """
    self._global_context_sampling_fn = global_context_sampling_fn
    self._arm_context_sampling_fn = arm_context_sampling_fn
    self._max_num_actions = max_num_actions
    self._reward_fn = reward_fn
    self._num_actions_fn = num_actions_fn
    self._batch_size = batch_size

    # One sample from each generator determines the feature specs.
    global_spec = array_spec.ArraySpec.from_array(global_context_sampling_fn())
    single_arm_spec = array_spec.ArraySpec.from_array(arm_context_sampling_fn())
    observation_spec = {
        GLOBAL_KEY: global_spec,
        PER_ARM_KEY: array_spec.add_outer_dims_nest(
            single_arm_spec, (max_num_actions,)),
    }

    if self._num_actions_fn is not None:
        # The feature's dtype follows the type of a sampled action count.
        sampled_num_actions = self._num_actions_fn()
        observation_spec[NUM_ACTIONS_KEY] = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.dtype(type(sampled_num_actions)),
            minimum=1,
            maximum=max_num_actions)

    action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=max_num_actions - 1,
        name='action')

    super(StationaryStochasticPerArmPyEnvironment, self).__init__(
        observation_spec, action_spec)
def __init__(self, global_context_sampling_fn, arm_context_sampling_fn,
             num_actions, reward_fn, batch_size=1):
    """Initializes the environment.

    In each round, the global context is generated by
    `global_context_sampling_fn` and the per-arm contexts are generated by
    `arm_context_sampling_fn`. The `reward_fn` function takes the concatenation
    of a global and a per-arm feature, and outputs a possibly random reward.

    Example:
      def global_context_sampling_fn():
        return np.random.randint(0, 10, [2])  # 2-dimensional global features.

      def arm_context_sampling_fn():
        return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

      def reward_fn(x):
        return sum(x)

      env = StationaryStochasticPerArmPyEnvironment(
          global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn)

    Args:
      global_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats. This output is the global context. Its shape
        and type must be consistent across calls.
      arm_context_sampling_fn: A function that outputs a random 1d array or
        list of ints or floats (same type as the output of
        `global_context_sampling_fn`). This output is the per-arm context. Its
        shape must be consistent across calls.
      num_actions: (int) The number of actions in every sample.
      reward_fn: A function that generates a reward when called with an
        observation.
      batch_size: The batch size.
    """
    self._global_context_sampling_fn = global_context_sampling_fn
    self._arm_context_sampling_fn = arm_context_sampling_fn
    self._num_actions = num_actions
    self._reward_fn = reward_fn
    self._batch_size = batch_size

    # One sample from each generator determines the feature specs.
    observation_spec = {
        GLOBAL_KEY:
            array_spec.ArraySpec.from_array(global_context_sampling_fn()),
        PER_ARM_KEY:
            array_spec.add_outer_dims_nest(
                array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
                (num_actions,)),
    }

    # The per-arm observation holds `num_actions` arms, so valid action
    # indices are 0 .. num_actions - 1. The spec bounds are inclusive, hence
    # the `- 1`; `maximum=num_actions` would admit an out-of-range arm index.
    action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=num_actions - 1,
        name='action')

    super(StationaryStochasticPerArmPyEnvironment, self).__init__(
        observation_spec, action_spec)