def test_clear(self):
    """Verify that `clear()` brings the buffer back to its initial state.

    The state of the fresh buffer (last id and `gather_all()` output) is
    captured, the add op from `_get_add_op` is evaluated and a sample is
    drawn, then the buffer is cleared. After clearing, the last id and the
    gathered items (values and dtypes) must match the captured ones.
    """
    batch_size = 1
    data_spec = self._data_spec()
    buffer = TfPrioritizedReplayBuffer(
        data_spec, batch_size=batch_size, max_length=1)
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Snapshot of the untouched buffer; compared against after clear().
    id_before = self.evaluate(buffer._get_last_id())
    items_before = self.evaluate(buffer.gather_all())

    added, _ = self.evaluate(_get_add_op(data_spec, buffer, batch_size))
    sampled, _ = self.evaluate(buffer.get_next(sample_batch_size=3))

    def assert_sampled_contains(value, samples):
        self._assert_contains([value], list(samples))

    tf.nest.map_structure(assert_sampled_contains, added, sampled)
    self.assertNotEqual(id_before, self.evaluate(buffer._get_last_id()))

    self.evaluate(buffer.clear())
    self.assertEqual(id_before, self.evaluate(buffer._get_last_id()))

    def assert_same_values_and_dtype(before, after):
        np.testing.assert_equal(before, after)
        self.assertEqual(before.dtype, after.dtype)

    items_after = self.evaluate(buffer.gather_all())
    tf.nest.map_structure(
        assert_same_values_and_dtype, items_before, items_after)
def test_gather_all_empty_batch_five(self):
    """`gather_all()` on a buffer with nothing added yields one empty row per batch entry."""
    batch_size = 5
    action_spec = specs.TensorSpec([], tf.int32, 'action')
    buffer = TfPrioritizedReplayBuffer(action_spec, batch_size=batch_size)
    gathered = buffer.gather_all()

    # One empty list for each of the `batch_size` rows.
    expected = [[] for _ in range(batch_size)]

    self.evaluate(tf.compat.v1.global_variables_initializer())
    gathered_np = self.evaluate(gathered)
    self.assertAllClose(expected, gathered_np)
def test_get_next_empty(self):
    """Sampling from a buffer that holds no items must raise `InvalidArgumentError`."""
    spec = self._data_spec()
    replay_buffer = TfPrioritizedReplayBuffer(spec, batch_size=1, max_length=1)

    # Initialize outside the `with` block so that only the sampling calls
    # below can satisfy the expected-exception check.
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
    # which was removed from `unittest.TestCase` in Python 3.12.
    with self.assertRaisesRegex(
            tf.errors.InvalidArgumentError,
            'TFUniformReplayBuffer is empty. Make '
            'sure to add items before sampling the buffer.'):
        sample, _ = replay_buffer.get_next()
        self.evaluate(sample)
def test_add_single_sample_batch(self):
    """Add via `_get_add_op` with batch size 1 and check a 3-item sample contains the values."""
    batch_size = 1
    data_spec = self._data_spec()
    buffer = TfPrioritizedReplayBuffer(
        data_spec, batch_size=batch_size, max_length=1)
    added, add_op = _get_add_op(data_spec, buffer, batch_size)
    sampled, _ = buffer.get_next(sample_batch_size=3)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(add_op)
    added_np = self.evaluate(added)
    sampled_np = self.evaluate(sampled)

    def assert_sampled_contains(value, samples):
        self._assert_contains([value], list(samples))

    tf.nest.map_structure(assert_sampled_contains, added_np, sampled_np)
def test_add_batch_five(self):
    """Add via `_get_add_op` with batch size 5 and check `get_next()` returns the same values."""
    batch_size = 5
    data_spec = self._data_spec()
    scope_name = 'rb{}'.format(batch_size)
    buffer = TfPrioritizedReplayBuffer(
        data_spec, batch_size=batch_size, max_length=1, scope=scope_name)
    added, add_op = _get_add_op(data_spec, buffer, batch_size)
    sampled, _ = buffer.get_next()

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(add_op)
    sampled_np = self.evaluate(sampled)
    added_np = self.evaluate(added)

    tf.nest.map_structure(self.assertAllClose, added_np, sampled_np)
def test_gather_all_batch_five(self):
    """After adding `range(5)` as one batch, `gather_all()` returns `[[0], [1], [2], [3], [4]]`."""
    batch_size = 5
    action_spec = specs.TensorSpec([], tf.int64, 'action')
    buffer = TfPrioritizedReplayBuffer(action_spec, batch_size=batch_size)

    @common.function(autograph=True)
    def write_range_batch():
        buffer.add_batch(tf.range(0, batch_size, 1, dtype=tf.int64))

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(write_range_batch())

    gathered = buffer.gather_all()
    # Each batch row is expected to hold exactly its own index.
    expected = [[row] for row in range(batch_size)]
    gathered_np = self.evaluate(gathered)
    self.assertAllClose(expected, gathered_np)
def _init_replay_buffer(self, batch_size, traj_spec):
    """Create the underlying `TFReplayBuffer` and store it on `self._replay_buffer`.

    Params:
        batch_size: batch size handed to the buffer; also written to the
            "replay_buffer_size" scalar summary
        traj_spec: data spec handed to the buffer
    """
    # Collect the constructor arguments first; `max_length` is fixed to 1.
    rb_kwargs = {
        "batch_size": batch_size,
        "data_spec": traj_spec,
        "max_length": 1,
        "alpha": self._alpha,
    }
    tf.compat.v2.summary.scalar(name="replay_buffer_size", data=batch_size)
    self._replay_buffer = TFReplayBuffer(**rb_kwargs)
def test_sample_batch_correct_probabilities_batch_ten_as_dataset(self):
    """Check the `probabilities` reported by `as_dataset` for a freshly filled buffer.

    Ten single-element items are added, one mini-batch of two is drawn from
    the dataset, and the `probabilities` field of the returned metadata is
    compared with the weights computed locally from `alpha` and `beta`.
    """
    buffer_batch_size = 10
    alpha = 0.6
    beta = 0.6
    sample_batch_size = 2

    action_spec = specs.TensorSpec([], tf.int32, 'action')
    buffer = TfPrioritizedReplayBuffer(
        action_spec, batch_size=buffer_batch_size, max_length=1, alpha=alpha)

    # Item k is a length-1 int32 array filled with k.
    item_shape = (1, )
    items = [
        np.full(item_shape, k, dtype=np.int32)
        for k in range(buffer_batch_size)
    ]
    buffer.add_batch(tf.convert_to_tensor(items))

    self.evaluate(tf.compat.v1.global_variables_initializer())

    dataset = buffer.as_dataset(sample_batch_size=sample_batch_size, beta=beta)
    dataset_iter = iter(dataset)

    def fetch_next():
        return next(dataset_iter)

    # The formula below uses a priority of 1 for every item.
    selection_probability = 1**alpha / ((1**alpha) * buffer_batch_size)
    single_weight_inputs = range(sample_batch_size)
    expected_weights = [
        (buffer_batch_size * selection_probability)**(-beta)
        for _ in single_weight_inputs
    ]

    # The callable itself (not its result) is handed to `self.evaluate`.
    # NOTE(review): presumably `evaluate` invokes callables — confirm.
    result = self.evaluate(fetch_next)
    actual_weights = result[1].probabilities
    self.assertAllClose(expected_weights, actual_weights)
def test_sample_batch_Correct_probabilities_batch_ten(self):
    """Check the `probabilities` reported by `get_next` for a freshly filled buffer.

    Ten single-element items are added, two are sampled with `get_next`, and
    the `probabilities` field of the returned buffer info is compared with
    the weights computed locally from `alpha` and `beta`.
    """
    buffer_batch_size = 10
    alpha = 0.6
    sample_batch_size = 2

    action_spec = specs.TensorSpec([], tf.int32, 'action')
    buffer = TfPrioritizedReplayBuffer(
        action_spec, batch_size=buffer_batch_size, max_length=1, alpha=alpha)

    # Item k is a length-1 int32 array filled with k.
    item_shape = (1, )
    items = [
        np.full(item_shape, k, dtype=np.int32)
        for k in range(buffer_batch_size)
    ]
    buffer.add_batch(tf.convert_to_tensor(items))

    @common.function
    def sampled_probabilities():
        _, info = buffer.get_next(sample_batch_size=sample_batch_size)
        return info.probabilities

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # NOTE(review): `beta` is only used for the expected value; it is not
    # passed to `get_next`, so this presumably relies on the buffer's
    # default beta — confirm.
    beta = 0.6
    # The formula below uses a priority of 1 for every item.
    selection_probability = 1**alpha / ((1**alpha) * buffer_batch_size)
    expected_weights = [
        (buffer_batch_size * selection_probability)**(-beta)
        for _ in range(sample_batch_size)
    ]

    actual_weights = self.evaluate(sampled_probabilities())
    self.assertAllClose(expected_weights, actual_weights)
class TFPrioritizedReplayBuffer(TFReplayBufferAbstract):
    """Prioritized replay buffer wrapper that also feeds a training metric tracker.

    The underlying `TFReplayBuffer` is created on the first `add_batch` call,
    using the batch size of the first trajectory. `pre_process` must run
    before `get_batch` / `post_process`, since it creates the metric tracker
    and computes the beta used for sampling.
    """

    # Beta used for the current training iteration; set by `pre_process`.
    _curr_beta = None

    def __init__(self, collect_data_spec, alpha, beta, training_iterations_num):
        """
        Store replay buffer params

        Params:
            collect_data_spec: spec of the data to be added to the buffer
            alpha: This param is used to determine how much emphasis is given to the priority
            beta: value handed to the buffer's `compute_beta` in `pre_process`, from which the
                per-iteration beta used for sampling is derived
            training_iterations_num: total number of training iterations; passed to `compute_beta`
                and to the metric tracker, and compared with the current iteration in
                `post_process` to decide when summary metrics are logged
        """
        super().__init__(collect_data_spec)
        self._beta = beta
        self._alpha = alpha
        self._training_iterations_total_num = training_iterations_num
        # Created lazily by `pre_process`.
        self._metric_tracker = None

    def _init_replay_buffer(self, batch_size, traj_spec):
        """
        Create the underlying `TFReplayBuffer` and store it on `self._replay_buffer`

        Params:
            batch_size: batch size of the buffer; also written to the "replay_buffer_size" summary
            traj_spec: data spec of the trajectories stored in the buffer
        """
        buffer_config = {
            "batch_size": batch_size,
            "data_spec": traj_spec,
            "max_length": 1,
            "alpha": self._alpha
        }
        tf.compat.v2.summary.scalar(name="replay_buffer_size", data=batch_size)
        self._replay_buffer = TFReplayBuffer(**buffer_config)

    def add_batch(self, traj_dict):
        """
        add a trajectory to the replay buffer

        Params
            traj_dict (dict[dim]:numpy): a dict of tensors representing the trajectory to be added
                to the replay buffer; its "observation" entry determines the batch size when
                the buffer is first created
        """
        collect_spec_dict = self.collect_data_spec._asdict()
        traj_tf, traj_spec = build_tf_trajectory(traj_dict, collect_spec_dict)

        # NOTE(review): `_replay_buffer` is not assigned in this class's __init__; presumably the
        # base class initializes it to a falsy value — confirm.
        if not self._replay_buffer:
            batch_size = len(traj_dict["observation"])
            self._init_replay_buffer(batch_size, traj_spec)

        self._replay_buffer.add_batch(traj_tf)

    def get_batch(self, batch_size):
        """
        Sample a batch from the replay buffer and record its metadata in the metric tracker

        Params:
            batch_size: number of items to sample

        Returns:
            the `(traj, metadata)` pair returned by the buffer's `get_next`
        """
        traj, metadata = self._replay_buffer.get_next(sample_batch_size=batch_size,
                                                      beta=self._curr_beta)
        self._metric_tracker.add_batch_weights(metadata.probabilities)
        self._metric_tracker.add_batch_indices(metadata.ids)
        return traj, metadata

    def pre_process(self, curr_iter):
        """
        Prepare the buffer state for training iteration `curr_iter`

        Params:
            curr_iter: index of the training iteration about to run
        """
        if self._metric_tracker is None:
            self._metric_tracker = TrainingMetricTracker(self._training_iterations_total_num)

        # compute the beta that will be used when computing the importance sampling weights
        self._curr_beta = self._replay_buffer.compute_beta(self._beta, curr_iter,
                                                           self._training_iterations_total_num)

        # add important data to the metric tracker
        self._metric_tracker.latest_iteration = curr_iter
        self._metric_tracker.latest_beta = self._curr_beta

    def post_process(self, traj_meta, loss_info, curr_iter):
        """
        Update the priorities of the sampled experiences and log metrics

        Params:
            traj_meta: metadata returned by `get_batch`; its `ids` are the sampled indices
            loss_info: loss information of the training step; `loss_info[1].td_loss` holds the
                per-experience loss
            curr_iter: index of the training iteration that just ran

        Raises:
            ValueError: if the td loss and the indices do not have the same shape
        """
        indices = traj_meta.ids.numpy()

        # get the loss of every experience used during the training. it is stored in DQNLossInfo
        td_loss = loss_info[1].td_loss.numpy()

        # make sure the td loss array has the same size as the batch
        if td_loss.shape != indices.shape:
            # report both shapes (the second value used to be the rank of `indices`)
            raise ValueError("Expected the shape of the loss '%s' to be the same as the shape of the "
                             "indices '%s'" % (str(td_loss.shape), str(indices.shape)))

        # update the prioritized replay buffer
        self._replay_buffer.update_priorities(indices, td_loss)

        # NOTE(review): partial metrics are logged before `latest_loss_info` is refreshed —
        # confirm this ordering is intended.
        self._metric_tracker.log_partial_metrics()
        self._metric_tracker.latest_loss_info = loss_info

        if curr_iter == self._training_iterations_total_num:
            self._metric_tracker.log_summary_metrics()
def test_prioritized_replay_buffer_as_dataset(self):
    """Check that sampling frequencies follow the priorities set via `update_priorities`.

    Phase 1 gives items 6..9 a priority equal to their value and items 0..5
    a priority of value / 10, then checks that the large items dominate the
    samples. Phase 2 resets all priorities to 1, inverts the scheme (items
    0..4 get value + 5, items 5..9 get value / 10) and checks the opposite.
    """
    np.random.seed(123)
    buffer_batch_size = 10
    alpha = 0.6
    spec = specs.TensorSpec([], tf.int32, 'action')
    replay_buffer = TfPrioritizedReplayBuffer(spec,
                                              batch_size=buffer_batch_size,
                                              max_length=1,
                                              alpha=alpha)

    # Item k is a length-1 int32 array filled with k.
    experience_shape = (1, )
    experience = [
        np.full(experience_shape, k, dtype=np.int32)
        for k in range(buffer_batch_size)
    ]
    tf_experience = tf.convert_to_tensor(experience)
    replay_buffer.add_batch(tf_experience)

    sample_batch_size = 10
    beta = 0.4

    # ---- Phase 1: large numbers get high priority ----
    sample_frequency = [0] * 10
    for i in range(15 * 3):
        ds = replay_buffer.as_dataset(sample_batch_size=sample_batch_size,
                                      beta=beta)
        itr = iter(ds)
        for _ in range(int(100 / 3)):
            mini_batch, metadata = next(itr)
            indices = metadata.ids.numpy()

            if i % 100 == 0:
                self.validate_data(mini_batch, indices)

            for idx in indices:
                sample_frequency[idx] += 1

            # set the loss of numbers larger than 5 to be equal to their number
            # set the loss of numbers smaller or equal to 5 to a tenth of their number
            priorities = [idx if idx > 5 else idx / 10 for idx in indices]
            replay_buffer.update_priorities(indices, priorities)

    for i in range(10):
        if i <= 5:
            # numbers smaller or equal to 5 should be picked at most 5% of the time
            self.assertLessEqual(sample_frequency[i], 15000 * 5 / 100)
        else:
            # all numbers larger than 5 should be picked between 15% and 30% of the time
            self.assertGreaterEqual(sample_frequency[i], 15000 * 15 / 100)
            self.assertLessEqual(sample_frequency[i], 15000 * 30 / 100)

            # all numbers larger than 5 should be selected at least as often as the number
            # which precedes them and at most as often as the number that follows them
            self.assertGreaterEqual(sample_frequency[i], sample_frequency[i - 1])
            if i < 9:
                self.assertLessEqual(sample_frequency[i], sample_frequency[i + 1])

    # Reset every priority to 1 before the second phase.
    indices = list(range(10))
    priorities = [1] * 10
    replay_buffer.update_priorities(indices, priorities)

    np.random.seed(12323423)

    # ---- Phase 2: small numbers get high priority ----
    sample_frequency = [0] * 10
    for i in range(15 * 20):
        ds = replay_buffer.as_dataset(sample_batch_size=sample_batch_size,
                                      beta=beta)
        itr = iter(ds)
        for _ in range(int(100 / 20)):
            mini_batch, metadata = next(itr)
            indices = metadata.ids.numpy()

            if i % 100 == 0:
                self.validate_data(mini_batch, indices)

            for idx in indices:
                sample_frequency[idx] += 1

            # set the loss of numbers larger or equal to 5 to a tenth of their number
            # set the loss of numbers smaller than 5 to their number + 5
            priorities = [idx / 10 if idx >= 5 else idx + 5 for idx in indices]
            replay_buffer.update_priorities(indices, priorities)

    for i in range(10):
        if i >= 5:
            # numbers larger or equal to 5 should be picked at most 5% of the time
            self.assertLessEqual(sample_frequency[i], 15000 * 5 / 100)
        else:
            # all numbers smaller than 5 should be picked between 10% and 25% of the time
            self.assertGreaterEqual(sample_frequency[i], 15000 * 10 / 100)
            self.assertLessEqual(sample_frequency[i], 15000 * 25 / 100)

            # all numbers smaller than 5 should be selected at least as often as the number
            # which precedes them and at most as often as the number that follows them.
            # Item 0 has no predecessor: without the guard, index -1 would wrap around
            # and compare against item 9.
            if i > 0:
                self.assertGreaterEqual(sample_frequency[i], sample_frequency[i - 1])
            if i < 4:
                self.assertLessEqual(sample_frequency[i], sample_frequency[i + 1])
def test_clear_all_variables(self):
    """Verify that `_clear(clear_all_variables=True)` resets ids, items and table variables.

    The buffer is cleared once to capture its empty state, the same batch is
    added four times and sampled, and the buffer is cleared again. The table
    variables must differ from the empty state after adding, and the last id
    and gathered items must match the empty state after the final clear.
    """
    batch_size = 1
    data_spec = self._data_spec()
    buffer = TfPrioritizedReplayBuffer(
        data_spec, batch_size=batch_size, max_length=1)

    def filled(fill_value, tensor_spec):
        # Constant float32 tensor of the spec's shape, filled with `fill_value`.
        ones = np.ones(tensor_spec.shape.as_list(), dtype=np.float32)
        return tf.constant(fill_value * ones)

    action = filled(1, data_spec[0])
    lidar = filled(2, data_spec[1][0])
    camera = filled(3, data_spec[1][1])
    values = [action, [lidar, camera]]
    values_batched = tf.nest.map_structure(
        lambda t: tf.stack([t] * batch_size), values)

    if tf.executing_eagerly():
        # In eager mode the add is wrapped in a callable handed to `self.evaluate`.
        def add_op():
            return buffer.add_batch(values_batched)
    else:
        add_op = buffer.add_batch(values_batched)

    def table_variables():
        return [v for v in buffer.variables() if 'Table' in v.name]

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(buffer._clear(clear_all_variables=True))

    # Snapshot of the cleared buffer; compared against further below.
    cleared_table_vars = self.evaluate(table_variables())
    id_before = self.evaluate(buffer._get_last_id())
    items_before = self.evaluate(buffer.gather_all())

    for _ in range(4):
        self.evaluate(add_op)

    values_np = self.evaluate(values)
    sampled, _ = self.evaluate(buffer.get_next(sample_batch_size=3))

    def assert_sampled_contains(value, samples):
        self._assert_contains([value], list(samples))

    tf.nest.map_structure(assert_sampled_contains, values_np, sampled)
    self.assertNotEqual(id_before, self.evaluate(buffer._get_last_id()))

    def assert_not_all_equal(before, after):
        self.assertFalse(np.all(before == after))

    tf.nest.map_structure(
        assert_not_all_equal, cleared_table_vars,
        self.evaluate(table_variables()))

    self.evaluate(buffer._clear(clear_all_variables=True))
    self.assertEqual(id_before, self.evaluate(buffer._get_last_id()))

    def assert_same_values_and_dtype(before, after):
        np.testing.assert_equal(before, after)
        self.assertEqual(before.dtype, after.dtype)

    items_after = self.evaluate(buffer.gather_all())
    tf.nest.map_structure(
        assert_same_values_and_dtype, items_before, items_after)