def testBatch(self, metric_class, expected_result):
  metric = metric_class()
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.first((), (), (), 1., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.mid((), (), (), 2., 1.),
          trajectory.last((), (), (), 3., 0.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.last((), (), (), 3., 0.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  self.assertEqual(expected_result, metric.result())
def get_single():
  """Gets a single item from the replay buffer."""
  with self._lock:
    if self._np_state.size <= 0:

      def empty_item(spec):
        return np.empty(spec.shape, dtype=spec.dtype)

      if num_steps is not None:
        item = [tf.nest.map_structure(empty_item, self.data_spec)
                for n in range(num_steps)]
        if time_stacked:
          item = nest_utils.stack_nested_arrays(item)
      else:
        item = tf.nest.map_structure(empty_item, self.data_spec)
      return item

    idx = np.random.randint(self._np_state.size - num_steps_value + 1)
    if self._np_state.size == self._capacity:
      # If the buffer is full, add cur_id (head of circular buffer) so that
      # we sample from the range [cur_id, cur_id + size - num_steps_value].
      # We will modulo the size below.
      idx += self._np_state.cur_id

    if num_steps is not None:
      # TODO(b/120242830): Try getting data from numpy in one shot rather
      # than num_steps_value.
      item = [self._decode(self._storage.get((idx + n) % self._capacity))
              for n in range(num_steps)]
    else:
      item = self._decode(self._storage.get(idx % self._capacity))

  if num_steps is not None and time_stacked:
    item = nest_utils.stack_nested_arrays(item)
  return item
def testBatchSizeProvided(self, metric_class, expected_result):
  metric = py_metrics.AverageReturnMetric(batch_size=2)
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.first((), (), (), 1., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.mid((), (), (), 2., 1.),
          trajectory.last((), (), (), 3., 0.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.last((), (), (), 3., 0.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  self.assertEqual(metric.result(), 5.0)
def _step(self, actions):
  """Progress the agent one step in the environment."""
  actions = actions.flatten()
  self.states[:, self.partial_seq_len, -1] = 0
  self.states[np.arange(self._batch_size), self.partial_seq_len, actions] = 1
  self.partial_seq_len += 1

  # We have not generated the last residue in the sequence yet, so continue.
  if self.partial_seq_len < self.seq_length - 1:
    return nest_utils.stack_nested_arrays(
        [ts.transition(seq_state, 0) for seq_state in self.states])

  # The sequence is of full length, so score it and end the episode. Drop the
  # last column of the matrix (-1), which represents the mask token.
  complete_sequences = [
      s_utils.one_hot_to_string(seq_state[:, :-1], self.alphabet)
      for seq_state in self.states
  ]
  if self.fitness_model_is_gt:
    fitnesses = self.landscape.get_fitness(complete_sequences)
  else:
    fitnesses = self.model.get_fitness(complete_sequences)
  self.all_seqs.update(zip(complete_sequences, fitnesses))

  # Reward = fitness - lambda * sequence density.
  rewards = np.array([
      f - self.lam * self.sequence_density(seq)
      for seq, f in zip(complete_sequences, fitnesses)
  ])
  return nest_utils.stack_nested_arrays([
      ts.termination(seq_state, r)
      for seq_state, r in zip(self.states, rewards)
  ])
def get_single(idx):
  """Gets the idx item from the replay buffer."""
  with self._lock:
    if self._np_state.size <= idx:

      def empty_item(spec):
        return np.empty(spec.shape, dtype=spec.dtype)

      item = [
          tf.nest.map_structure(empty_item, self.data_spec)
          for n in range(num_steps_value)
      ]
      item = nest_utils.stack_nested_arrays(item)
      return item

    if self._np_state.size == self._capacity:
      # If the buffer is full, add cur_id (head of circular buffer) so that
      # we sample from the range [cur_id, cur_id + size - num_steps_value].
      # We will modulo the size below.
      idx += self._np_state.cur_id

    item = [
        self._decode(self._storage.get((idx + n) % self._capacity))
        for n in range(num_steps_value)
    ]

  item = nest_utils.stack_nested_arrays(item)
  return item
def get_episode(self,
                batch_size: Optional[int] = None,
                truncate_episode_at: Optional[int] = None
               ) -> Tuple[EnvStep, np.ndarray]:
  if batch_size is None:
    episode = self._get_episode(truncate_episode_at)
    mask = np.ones((len(episode),))
    return nest_utils.stack_nested_arrays(episode), mask

  if batch_size <= 0:
    raise ValueError('Invalid batch size %s.' % batch_size)

  episodes = []
  episode_lengths = []
  for _ in range(batch_size):
    next_episode = self._get_episode(truncate_episode_at)
    episodes.append(next_episode)
    episode_lengths.append(len(next_episode))

  max_length = max(episode_lengths)
  for episode in episodes:
    episode.extend([episode[-1]] * (max_length - len(episode)))

  batched_episodes = nest_utils.stack_nested_arrays(
      [nest_utils.stack_nested_arrays(episode) for episode in episodes])
  mask = np.arange(max_length)[None, :] < np.array(episode_lengths)[:, None]
  return batched_episodes, mask
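The broadcasted comparison that builds `mask` above is compact; the following small standalone check (illustrative episode lengths only, not taken from any dataset) shows what it produces: position j of episode i is marked valid iff j < episode_lengths[i].

import numpy as np

episode_lengths = [3, 1, 2]
max_length = max(episode_lengths)
mask = np.arange(max_length)[None, :] < np.array(episode_lengths)[:, None]
print(mask.astype(np.int32))
# [[1 1 1]
#  [1 0 0]
#  [1 1 0]]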
def _add_history(self, time_step, action):
  self._observation_history.append(time_step.observation)
  self._action_history.append(action)

  if self._include_actions:
    observation = {
        'observation':
            nest_utils.stack_nested_arrays(self._observation_history),
        'action':
            nest_utils.stack_nested_arrays(self._action_history)
    }
  else:
    observation = nest_utils.stack_nested_arrays(self._observation_history)
  return time_step._replace(observation=observation)
def _step(self, actions):
  """Forward a batch of actions to the wrapped environments.

  Args:
    actions: Batched action, possibly nested, to apply to the environment.

  Raises:
    ValueError: Invalid actions.

  Returns:
    Batch of observations, rewards, and done flags.
  """
  if self._num_envs == 1:
    actions = nest_utils.unbatch_nested_array(actions)
    time_steps = self._envs[0].step(actions)
    return nest_utils.batch_nested_array(time_steps)
  else:
    unstacked_actions = unstack_actions(actions)
    if len(unstacked_actions) != self.batch_size:
      raise ValueError(
          "Primary dimension of action items does not match "
          "batch size: %d vs. %d" % (len(unstacked_actions), self.batch_size))
    time_steps = self._execute(
        lambda env_action: env_action[0].step(env_action[1]),
        zip(self._envs, unstacked_actions))
    return nest_utils.stack_nested_arrays(time_steps)
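A minimal sketch (assuming tf_agents and numpy are installed; the 'move' key is made up for illustration) of the unstack/stack round trip the batched _step relies on: the batched nest is split into one nest per environment, and the per-environment results are stacked back into a single batched nest.

import numpy as np
from tf_agents.utils import nest_utils

# A batched nest with leading batch dimension 2.
batched_actions = {'move': np.array([[0, 1], [1, 0]], dtype=np.int32)}

# Split into one nest per environment (batch dimension removed) ...
per_env_actions = nest_utils.unstack_nested_arrays(batched_actions)
assert len(per_env_actions) == 2
assert per_env_actions[0]['move'].shape == (2,)

# ... then stack the per-environment results back into a batched nest.
rebatched = nest_utils.stack_nested_arrays(per_env_actions)
assert rebatched['move'].shape == (2, 2)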
def render(self, mode="rgb_array") -> Optional[types.NestedArray]: if self._num_envs == 1: img = self._envs[0].render(mode) return nest_utils.batch_nested_array(img) else: imgs = self._execute(lambda env: env.render(mode), self._envs) return nest_utils.stack_nested_arrays(imgs)
def reset_random(self):
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(self._envs[0].reset_random())
  else:
    time_steps = self._execute(lambda env: env.reset_random(), self._envs)
    return nest_utils.stack_nested_arrays(time_steps)
def _get_next(self,
              sample_batch_size=None,
              num_steps=None,
              time_stacked=True):
  num_steps_value = num_steps if num_steps is not None else 1

  def get_single():
    """Gets a single item from the replay buffer."""
    with self._lock:
      if self._np_state.size <= 0:
        raise ValueError('Read error: empty replay buffer')

      idx = np.random.randint(self._np_state.size - num_steps_value + 1)
      if self._np_state.size == self._capacity:
        # If the buffer is full, add cur_id (head of circular buffer) so that
        # we sample from the range [cur_id, cur_id + size - num_steps_value].
        # We will modulo the size below.
        idx += self._np_state.cur_id

      if num_steps is not None:
        # TODO(b/120242830): Try getting data from numpy in one shot rather
        # than num_steps_value.
        item = [self._decode(self._storage.get((idx + n) % self._capacity))
                for n in range(num_steps)]
      else:
        item = self._decode(self._storage.get(idx % self._capacity))

    if num_steps is not None and time_stacked:
      item = nest_utils.stack_nested_arrays(item)
    return item

  if sample_batch_size is None:
    return get_single()
  else:
    samples = [get_single() for _ in range(sample_batch_size)]
    return nest_utils.stack_nested_arrays(samples)
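A small numeric sketch (hypothetical values, not taken from any real buffer state) of the index arithmetic get_single applies once the buffer is full: the random start index is shifted by cur_id, the head of the circular buffer, and each step offset is wrapped modulo capacity.

import numpy as np

capacity = 8
size = 8            # buffer is full
cur_id = 5          # head of the circular buffer
num_steps_value = 2

idx = np.random.randint(size - num_steps_value + 1)  # uniform over valid starts
idx += cur_id                                        # shift into circular order
physical_slots = [(idx + n) % capacity for n in range(num_steps_value)]
# Two consecutive slots, possibly wrapping past the end of the storage.
print(physical_slots)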
def _get_initial_state(self, batch_size: int) -> types.NestedArray:
  if self._num_policies == 1:
    return nest_utils.batch_nested_array(
        self._policies[0].get_initial_state())
  else:
    infos = self._execute(_execute_get_initial_state, self._policies)
    infos = nest_utils.unbatch_nested_array(infos)
    return nest_utils.stack_nested_arrays(infos)
def _gather_all(self):
  data = [
      self._decode(self._storage.get(idx)) for idx in range(self._capacity)
  ]
  stacked = nest_utils.stack_nested_arrays(data)
  batched = tf.nest.map_structure(lambda t: np.expand_dims(t, 0), stacked)
  return batched
def generator_fn():
  while True:
    if sample_batch_size is not None:
      batch = [self._get_next(num_steps=num_steps, time_stacked=False)
               for _ in range(sample_batch_size)]
      item = nest_utils.stack_nested_arrays(batch)
    else:
      item = self._get_next(num_steps=num_steps, time_stacked=False)
    yield tuple(tf.nest.flatten(item))
def get_single():
  """Gets a single item from the replay buffer."""
  if num_steps is not None and time_stacked:
    item = [self.load_item_from_bigtable() for _ in range(num_steps)]
    item = nest_utils.stack_nested_arrays(item)
  else:
    item = self.load_item_from_bigtable()
  buffer_info = BufferInfo(ids=0, probabilities=0)
  return item, buffer_info
def _reset(self):
  """Reset all environments and combine the resulting observations.

  Returns:
    Time step with batch dimension.
  """
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(self._envs[0].reset())
  else:
    time_steps = self._execute(lambda env: env.reset(), self._envs)
    return nest_utils.stack_nested_arrays(time_steps)
def testStackNestedArrays(self):
  shape = (5, 8)
  batch_size = 3
  batched_shape = (batch_size,) + shape
  specs = self.nest_spec(shape)
  unstacked_arrays = [self.zeros_from_spec(specs) for _ in range(batch_size)]

  stacked_array = nest_utils.stack_nested_arrays(unstacked_arrays)

  tf.nest.assert_same_structure(specs, stacked_array)
  assert_shapes = lambda a: self.assertEqual(a.shape, batched_shape)
  tf.nest.map_structure(assert_shapes, stacked_array)
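For reference, a minimal standalone sketch (assuming tf_agents and numpy are installed; the nest structure below is made up) of what stack_nested_arrays does: it takes a list of identically structured nests of numpy arrays and stacks each leaf along a new leading batch dimension.

import numpy as np
from tf_agents.utils import nest_utils

nest_a = {'obs': np.zeros((5, 8), dtype=np.float32), 'reward': np.float32(1.0)}
nest_b = {'obs': np.ones((5, 8), dtype=np.float32), 'reward': np.float32(2.0)}

stacked = nest_utils.stack_nested_arrays([nest_a, nest_b])
assert stacked['obs'].shape == (2, 5, 8)   # new leading batch dimension of 2
assert stacked['reward'].shape == (2,)     # scalars become a length-2 vector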
def generator_fn():
  while True:
    if sample_batch_size is not None:
      batch = [self.get_next(num_steps=num_steps_value,
                             time_stacked=False,
                             prioritized_buffer_beta=prioritized_buffer_beta)
               for _ in range(sample_batch_size)]
      item, item_idx, item_weight = nest_utils.stack_nested_arrays(batch)
    else:
      item, item_idx, item_weight = self.get_next(
          num_steps=num_steps_value,
          time_stacked=False,
          prioritized_buffer_beta=prioritized_buffer_beta)
    yield {"experiences": tuple(tf.nest.flatten(item)),
           "indices": item_idx,
           "weights": item_weight}
def testGetOuterArrayShape(self):
  spec = (
      array_spec.ArraySpec([5, 8], np.float32),
      (array_spec.ArraySpec([1], np.int32),
       array_spec.ArraySpec([2, 2, 2], np.float32))
  )

  batch_size = 3
  unstacked_arrays = [self.zeros_from_spec(spec) for _ in range(batch_size)]

  outer_dims = nest_utils.get_outer_array_shape(unstacked_arrays[0], spec)
  self.assertEqual((), outer_dims)

  stacked_array = nest_utils.stack_nested_arrays(unstacked_arrays)
  outer_dims = nest_utils.get_outer_array_shape(stacked_array, spec)
  self.assertEqual((batch_size,), outer_dims)

  time_dim = [nest_utils.batch_nested_array(arr) for arr in unstacked_arrays]
  batch_time = nest_utils.stack_nested_arrays(time_dim)
  outer_dims = nest_utils.get_outer_array_shape(batch_time, spec)
  self.assertEqual((batch_size, 1), outer_dims)
def setUp(self):
  super(BatchedPyMetricTest, self).setUp()
  # Order of args for trajectory methods:
  # (observation, action, policy_info, reward, discount)
  self._ts0 = nest_utils.stack_nested_arrays([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  self._ts1 = nest_utils.stack_nested_arrays([
      trajectory.first((), (), (), 1., 1.),
      trajectory.first((), (), (), 2., 1.)
  ])
  self._ts2 = nest_utils.stack_nested_arrays([
      trajectory.last((), (), (), 3., 1.),
      trajectory.last((), (), (), 4., 1.)
  ])
  self._ts3 = nest_utils.stack_nested_arrays([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  self._ts4 = nest_utils.stack_nested_arrays([
      trajectory.first((), (), (), 5., 1.),
      trajectory.first((), (), (), 6., 1.)
  ])
  self._ts5 = nest_utils.stack_nested_arrays([
      trajectory.last((), (), (), 7., 1.),
      trajectory.last((), (), (), 8., 1.)
  ])
def _action(self,
            time_step: ts.TimeStep,
            policy_state: types.NestedArray,
            seed: Optional[types.Seed] = None) -> ps.PolicyStep:
  """Forward a batch of time_steps and policy_states to the wrapped policies.

  Args:
    time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
    policy_state: An Array, or a nested dict, list or tuple of Arrays
      representing the previous policy_state.
    seed: Seed value used to initialize a pseudorandom number generator.

  Returns:
    A batch of `PolicyStep` named tuples, each one containing:
      `action`: A nest of action Arrays matching the `action_spec()`.
      `state`: A nest of policy states to be fed into the next call to action.
      `info`: Optional side information such as action log probabilities.

  Raises:
    NotImplementedError: if `seed` is not None.
  """
  if seed is not None:
    raise NotImplementedError(
        "seed is not supported; but saw seed: {}".format(seed))
  if self._num_policies == 1:
    time_step = nest_utils.unbatch_nested_array(time_step)
    policy_state = nest_utils.unbatch_nested_array(policy_state)
    policy_steps = self._policies[0].action(time_step, policy_state)
    return nest_utils.batch_nested_array(policy_steps)
  else:
    unstacked_time_steps = nest_utils.unstack_nested_arrays(time_step)
    if len(unstacked_time_steps) != len(self._policies):
      raise ValueError(
          "Primary dimension of time_step items does not match "
          "batch size: %d vs. %d" % (len(unstacked_time_steps),
                                     len(self._policies)))
    unstacked_policy_states = [()] * len(unstacked_time_steps)
    if policy_state:
      unstacked_policy_states = nest_utils.unstack_nested_arrays(policy_state)
      if len(unstacked_policy_states) != len(self._policies):
        raise ValueError(
            "Primary dimension of policy_state items does not match "
            "batch size: %d vs. %d" % (len(unstacked_policy_states),
                                       len(self._policies)))
    policy_steps = self._execute(
        _execute_policy,
        zip(self._policies, unstacked_time_steps, unstacked_policy_states))
    return nest_utils.stack_nested_arrays(policy_steps)
def get_step(self,
             batch_size: Optional[int] = None,
             num_steps: Optional[int] = None) -> EnvStep:
  if batch_size is not None:
    raise ValueError('This dataset does not support batched step sampling.')

  if num_steps is None:
    return self._get_step()

  env_steps = []
  for _ in range(num_steps):
    next_step = self._get_step()
    env_steps.append(next_step)

  return nest_utils.stack_nested_arrays(env_steps)
def step_adversary(self, actions):
  if self._num_envs == 1:
    actions = nest_utils.unbatch_nested_array(actions)
    time_steps = self._envs[0].step_adversary(actions)
    return nest_utils.batch_nested_array(time_steps)
  else:
    unstacked_actions = batched_py_environment.unstack_actions(actions)
    if len(unstacked_actions) != self.batch_size:
      raise ValueError(
          'Primary dimension of action items does not match '
          'batch size: %d vs. %d' % (len(unstacked_actions), self.batch_size))
    time_steps = self._execute(
        lambda env_action: env_action[0].step_adversary(env_action[1]),
        zip(self._envs, unstacked_actions))
    return nest_utils.stack_nested_arrays(time_steps)
def _get_next(self,
              sample_batch_size=None,
              num_steps=None,
              time_stacked=True):
  num_steps_value = num_steps if num_steps is not None else 1

  def get_single():
    """Gets a single item from the replay buffer."""
    if num_steps is not None and time_stacked:
      item = [self.load_item_from_bigtable() for _ in range(num_steps)]
      item = nest_utils.stack_nested_arrays(item)
    else:
      item = self.load_item_from_bigtable()
    buffer_info = BufferInfo(ids=0, probabilities=0)
    return item, buffer_info

  if sample_batch_size is None:
    return get_single()
  else:
    samples = [get_single() for _ in range(sample_batch_size)]
    return nest_utils.stack_nested_arrays(samples)
def render(self, mode: Text = 'rgb_array') -> types.NestedArray:
  """Renders the environment.

  Args:
    mode: Rendering mode. Currently only 'rgb_array' is supported because
      this is a batched environment.

  Returns:
    An ndarray of shape [batch_size, width, height, 3] denoting RGB images
    (for mode=`rgb_array`).

  Raises:
    NotImplementedError: If the environment does not support rendering, or
      any other mode than `rgb_array` is given.
  """
  if mode != 'rgb_array':
    raise NotImplementedError('Only rgb_array rendering mode is supported. '
                              'Got %s' % mode)

  imgs = [env.render(mode, blocking=self._blocking) for env in self._envs]
  if not self._blocking:
    imgs = [promise() for promise in imgs]
  return nest_utils.stack_nested_arrays(imgs)
def _generate_batch_of_observations(self, generator_fn, num_samples):
  unstacked_obs = [generator_fn() for _ in range(num_samples)]
  return nest_utils.stack_nested_arrays(unstacked_obs)
def get_next(self,
             sample_batch_size=None,
             prioritized_buffer_beta=0.4,
             num_steps=None,
             time_stacked=True):
  """Builds the next batch of experiences and their importance sampling weights.

  Args:
    sample_batch_size (int): batch size.
    prioritized_buffer_beta (float): exponent used when computing the
      importance sampling weights.
    num_steps (int): number of steps to load. Only 1 is supported at the
      moment.
    time_stacked (bool): whether the time steps are stacked or not.

  Returns:
    (Trajectory): mini batch of experiences.
    (int list): indices of the selected experiences in the replay buffer.
    (float list): importance sampling weights to use when training on the
      experiences.
  """
  num_steps_value = num_steps if num_steps is not None else 1
  if num_steps_value != 1:
    raise NotImplementedError(
        'PyPrioritizedReplayBuffer only supports batches with a num_steps '
        'size of 1, but received a batch with num_steps size {}.'.format(
            num_steps_value))

  def get_single_experience(b):
    """Gets a single experience from the replay buffer."""
    with self._lock:
      # Return empty items if the buffer is empty.
      if self._np_state.size <= 0:

        def empty_item(spec):
          return np.empty(spec.shape, dtype=spec.dtype)

        item = tf.nest.map_structure(empty_item, self.data_spec)
        selected_idx = -1
        selected_weight = -1
        return item, selected_idx, selected_weight

      # Select an index based on the priorities of the experiences in the
      # buffer.
      selected_idx, selected_weight = self.select_prioritized_experience(b)
      # Get the item.
      item = self._decode(
          self._storage.get(selected_idx % self._prioritized_buffer_capacity))
    return item, selected_idx, selected_weight

  if sample_batch_size is None:
    return get_single_experience(prioritized_buffer_beta)
  else:
    experiences = []
    buffer_indices = []
    importance_sampling_weights = []
    for _ in range(sample_batch_size):
      experience, idx, weight = get_single_experience(prioritized_buffer_beta)
      experiences.append(experience)
      buffer_indices.append(idx)
      importance_sampling_weights.append(weight)

    buffer_indices = np.array(buffer_indices)
    importance_sampling_weights = np.array(importance_sampling_weights,
                                           dtype=np.float32)
    # Normalize the weights by their maximum.
    importance_sampling_weights = np.divide(importance_sampling_weights,
                                            importance_sampling_weights.max())

    trajectory = nest_utils.stack_nested_arrays(experiences)
    return trajectory, buffer_indices, importance_sampling_weights
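A tiny numeric illustration (hypothetical weights, not produced by the buffer above) of the final normalization step: dividing by the maximum makes the largest importance sampling weight exactly 1.0 while preserving the ratios between weights.

import numpy as np

importance_sampling_weights = np.array([2.0, 0.5, 1.0], dtype=np.float32)
normalized = importance_sampling_weights / importance_sampling_weights.max()
print(normalized)  # [1.0, 0.25, 0.5]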
def get_info(self) -> types.NestedArray:
  if self._num_envs == 1:
    return nest_utils.batch_nested_array(self._envs[0].get_info())
  else:
    infos = self._execute(lambda env: env.get_info(), self._envs)
    return nest_utils.stack_nested_arrays(infos)
def _reset(self):
  self.partial_seq_len = 0
  self.states[:, :, :] = 0
  self.states[:, np.arange(self.seq_length), -1] = 1
  return nest_utils.stack_nested_arrays(
      [ts.restart(seq_state) for seq_state in self.states])