def testBatchedEnvironment(self, max_steps, max_episodes, expected_length):
  expected_trajectories = [
      trajectory.Trajectory(
          step_type=np.array([0, 0]),
          observation=np.array([0, 0]),
          action=np.array([2, 1]),
          policy_info=np.array([4, 2]),
          next_step_type=np.array([1, 1]),
          reward=np.array([1., 1.]),
          discount=np.array([1., 1.])),
      trajectory.Trajectory(
          step_type=np.array([1, 1]),
          observation=np.array([2, 1]),
          action=np.array([1, 2]),
          policy_info=np.array([2, 4]),
          next_step_type=np.array([2, 1]),
          reward=np.array([1., 1.]),
          discount=np.array([0., 1.])),
      trajectory.Trajectory(
          step_type=np.array([2, 1]),
          observation=np.array([3, 3]),
          action=np.array([2, 1]),
          policy_info=np.array([4, 2]),
          next_step_type=np.array([0, 2]),
          reward=np.array([0., 1.]),
          discount=np.array([1., 0.])),
  ]
  env1 = driver_test_utils.PyEnvironmentMock(final_state=3)
  env2 = driver_test_utils.PyEnvironmentMock(final_state=4)
  env = batched_py_environment.BatchedPyEnvironment([env1, env2])
  tf_env = tf_py_environment.TFPyEnvironment(env)
  policy = driver_test_utils.TFPolicyMock(
      tf_env.time_step_spec(),
      tf_env.action_spec(),
      batch_size=2,
      initial_policy_state=tf.constant([1, 2], dtype=tf.int32))
  replay_buffer_observer = MockReplayBufferObserver()
  driver = tf_driver.TFDriver(
      tf_env,
      policy,
      observers=[replay_buffer_observer],
      max_steps=max_steps,
      max_episodes=max_episodes,
  )
  initial_time_step = tf_env.reset()
  initial_policy_state = tf.constant([1, 2], dtype=tf.int32)
  self.evaluate(driver.run(initial_time_step, initial_policy_state))
  trajectories = replay_buffer_observer.gather_all()
  self.assertEqual(
      len(trajectories), len(expected_trajectories[:expected_length]))
  for t1, t2 in zip(trajectories, expected_trajectories[:expected_length]):
    for t1_field, t2_field in zip(t1, t2):
      self.assertAllEqual(t1_field, t2_field)
def test_collect_data_spec_trajectory(self):
  episode_dict = {
      'states': np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
                         dtype=np.float32),
      'actions': np.array([[1.], [2.], [3.], [4.]], dtype=np.float32),
      'rewards': np.array([[0.], [1.], [0.], [1.]], dtype=np.float32),
      'discounts': np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32),
      'episode_start_index': np.array([0, 2], dtype=np.int32)
  }
  expected_spec = trajectory.Trajectory(
      step_type=ArraySpec(shape=[], dtype=np.int32),
      observation=ArraySpec(shape=[2], dtype=np.float32),
      action=ArraySpec(shape=[1], dtype=np.float32),
      policy_info=(),
      next_step_type=ArraySpec(shape=[], dtype=np.int32),
      reward=ArraySpec(shape=[1], dtype=np.float32),
      discount=ArraySpec(shape=[], dtype=np.float32))
  actual_spec = create_collect_data_spec(episode_dict, use_trajectories=True)
  self.assertEqual(actual_spec, expected_spec)
def testAverageReturnMultiMetricTimeMisalignment(
    self, run_mode, num_trajectories, reward_spec, expected_result):
  with run_mode():
    trajectories = self._create_misaligned_trajectories()
    multi_trajectories = []
    for traj in trajectories:
      if isinstance(reward_spec, list):
        new_reward = [traj.reward, traj.reward]
      else:
        new_reward = tf.stack([traj.reward, traj.reward], axis=1)
      new_traj = trajectory.Trajectory(
          step_type=traj.step_type,
          observation=traj.observation,
          action=traj.action,
          policy_info=traj.policy_info,
          next_step_type=traj.next_step_type,
          reward=new_reward,
          discount=traj.discount)
      multi_trajectories.append(new_traj)
    metric = tf_metrics.AverageReturnMultiMetric(reward_spec, batch_size=2)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(metric.init_variables())
    for i in range(num_trajectories):
      self.evaluate(metric(multi_trajectories[i]))
    self.assertAllEqual(expected_result, self.evaluate(metric.result()))
    self.evaluate(metric.reset())
    self.assertAllEqual([0.0, 0.0], self.evaluate(metric.result()))
def testToNStepTransitionForNEquals1(self):
  first = ts.StepType.FIRST
  last = ts.StepType.LAST

  # Define a batch size 1, 2-step trajectory.
  traj = trajectory.Trajectory(
      step_type=np.array([[first, last]]),
      next_step_type=np.array([[last, first]]),
      observation=np.array([[10.0, 20.0]]),
      action=np.array([[11.0, 22.0]]),
      # The reward & discount values at step 1 are invalid dummy values.
      reward=np.array([[-1.0, 0.0]]),
      discount=np.array([[0.9, 0.0]]),
      policy_info=np.array([[10.0, 20.0]]))

  transition = trajectory.to_n_step_transition(traj, gamma=0.5)
  self.assertIsInstance(transition, trajectory.Transition)
  time_steps, policy_steps, next_time_steps = transition

  self.assertAllEqual(time_steps.step_type, np.array([first]))
  self.assertAllEqual(time_steps.observation, np.array([10.0]))
  self.assertAllEqual(time_steps.reward, np.array([np.nan]))
  self.assertAllEqual(time_steps.discount, np.array([np.nan]))
  self.assertAllEqual(next_time_steps.step_type, np.array([last]))
  self.assertAllEqual(next_time_steps.observation, np.array([20.0]))
  # r0
  self.assertAllEqual(next_time_steps.reward, np.array([-1.0]))
  # d0
  self.assertAllEqual(next_time_steps.discount, np.array([0.9]))
  self.assertAllEqual(policy_steps.action, np.array([11.0]))
  self.assertAllEqual(policy_steps.info, np.array([10.0]))
def testAgentTrajectoryTrain(self):
  agent = td3_agent.Td3Agent(
      self._time_step_spec,
      self._action_spec,
      critic_network=self._critic_net,
      actor_network=self._bounded_actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
  )
  trajectory_spec = trajectory.Trajectory(
      step_type=self._time_step_spec.step_type,
      observation=self._time_step_spec.observation,
      action=self._action_spec,
      policy_info=(),
      next_step_type=self._time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='reward'),
      discount=self._time_step_spec.discount)
  sample_trajectory_experience = tensor_spec.sample_spec_nest(
      trajectory_spec, outer_dims=(3, 2))
  agent.train(sample_trajectory_experience)
def setUp(self):
  super(ReverbReplayBufferTest, self).setUp()

  # Prepare the environment (and the corresponding specs).
  self._env = test_envs.EpisodeCountingEnv(steps_per_episode=3)
  tensor_time_step_spec = tf.nest.map_structure(tensor_spec.from_spec,
                                                self._env.time_step_spec())
  tensor_action_spec = tensor_spec.from_spec(self._env.action_spec())
  self._data_spec = trajectory.Trajectory(
      step_type=tensor_time_step_spec.step_type,
      observation=tensor_time_step_spec.observation,
      action=tensor_action_spec,
      policy_info=(),
      next_step_type=tensor_time_step_spec.step_type,
      reward=tensor_time_step_spec.reward,
      discount=tensor_time_step_spec.discount,
  )
  table_spec = tf.nest.map_structure(
      lambda s: tf.TensorSpec(dtype=s.dtype, shape=(None,) + s.shape),
      self._data_spec)
  self._array_data_spec = tensor_spec.to_nest_array_spec(self._data_spec)

  # Initialize and start a Reverb server (and set up a client to it).
  self._table_name = 'test_table'
  uniform_table = reverb.Table(
      self._table_name,
      max_size=100,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      signature=table_spec,
  )
  self._server = reverb.Server([uniform_table])
  self._py_client = reverb.Client('localhost:{}'.format(self._server.port))
def testWithAdvantageFn(self, with_value_network):
  advantage_fn = mock.Mock(side_effect=lambda returns, _: returns)
  value_network = (DummyValueNet(self._obs_spec)
                   if with_value_network else None)
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=False),
      value_network=value_network,
      advantage_fn=advantage_fn,
      optimizer=None,
  )

  step_type = tf.constant([[ts.StepType.FIRST, ts.StepType.LAST,
                            ts.StepType.FIRST, ts.StepType.LAST]])
  next_step_type = tf.constant([[ts.StepType.LAST, ts.StepType.FIRST,
                                 ts.StepType.LAST, ts.StepType.FIRST]])
  reward = tf.constant([[0, 0, 0, 0]], dtype=tf.float32)
  discount = tf.constant([[1, 1, 1, 1]], dtype=tf.float32)
  observations = tf.constant(
      [[[1, 2], [1, 2], [1, 2], [1, 2]]], dtype=tf.float32)
  actions = tf.constant([[[0], [1], [2], [3]]], dtype=tf.float32)
  experience = trajectory.Trajectory(
      step_type, observations, actions, (), next_step_type, reward, discount)

  agent.total_loss(experience, reward, None)

  advantage_fn.assert_called_once()
def _create_trajectories(n_time_steps, batch_size):
  # Observation looks like:
  # [[ 0.,  1., ...,  n_time_steps - 1.],
  #  [10., 11., ..., 10. + n_time_steps - 1.],
  #  [20., 21., ..., 20. + n_time_steps - 1.],
  #  [ ... ],
  #  [10. * (batch_size - 1), ..., 10. * (batch_size - 1) + n_time_steps - 1.]]
  observation_array = np.asarray(
      [np.arange(n_time_steps) + 10 * i for i in range(batch_size)])
  observations = tf.convert_to_tensor(observation_array, dtype=tf.float32)

  default_tensor = tf.constant(
      [[1] * n_time_steps] * batch_size, dtype=tf.float32)
  mid_time_step_val = ts.StepType.MID.tolist()
  time_steps = ts.TimeStep(
      step_type=tf.constant(
          [[mid_time_step_val] * n_time_steps] * batch_size, dtype=tf.int32),
      reward=default_tensor,
      discount=default_tensor,
      observation=observations)
  actions = tf.constant([[[1]] * n_time_steps] * batch_size, dtype=tf.float32)

  policy_info = {
      'dist_params': {
          'loc': tf.constant(
              [[[1]] * n_time_steps] * batch_size, dtype=tf.float32),
          'scale': tf.constant(
              [[[1]] * n_time_steps] * batch_size, dtype=tf.float32)
      },
      'value_prediction': default_tensor,
      'return': default_tensor,
      'advantage': default_tensor,
  }
  return trajectory.Trajectory(time_steps.step_type, observations, actions,
                               policy_info, time_steps.step_type,
                               time_steps.reward, time_steps.discount)
def testProcessExperienceGlobalFeatures(self):
  observation_spec = {
      'f1': tf.TensorSpec(shape=(5,), dtype=tf.string),
      'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
  }
  time_step_spec = time_step.time_step_spec(observation_spec)
  training_data_spec = trajectory.Trajectory(
      step_type=time_step_spec.step_type,
      observation=time_step_spec.observation,
      action=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=4, dtype=tf.int32),
      policy_info=(),
      next_step_type=time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          shape=(), minimum=0, maximum=2, dtype=tf.float32),
      discount=time_step_spec.discount)

  experience = tensor_spec.sample_spec_nest(
      training_data_spec, outer_dims=(7, 2))
  observation, action, reward = utils.process_experience_for_neural_agents(
      experience, False, training_data_spec)

  self.assertAllEqual(observation['f1'][0],
                      experience.observation['f1'][0, 0])
  self.assertEqual(action[0], experience.action[0, 0])
  self.assertEqual(reward[0], experience.reward[0, 0])
def testToTransition(self):
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST

  # Define a batch size 1, 3-step trajectory.
  traj = trajectory.Trajectory(
      step_type=np.array([[first, mid, last]]),
      next_step_type=np.array([[mid, last, first]]),
      observation=np.array([[10.0, 20.0, 30.0]]),
      action=np.array([[11.0, 22.0, 33.0]]),
      # The reward at step 2 (the boundary step) is an unused dummy value;
      # it is dropped by to_transition, as the asserts below show.
      reward=np.array([[0.0, 1.0, 2.0]]),
      discount=np.array([[1.0, 1.0, 0.0]]),
      policy_info=np.array([[1.0, 2.0, 3.0]]))

  time_steps, policy_steps, next_time_steps = trajectory.to_transition(traj)

  self.assertAllEqual(time_steps.step_type, np.array([[first, mid]]))
  self.assertAllEqual(time_steps.observation, np.array([[10.0, 20.0]]))
  self.assertAllEqual(next_time_steps.step_type, np.array([[mid, last]]))
  self.assertAllEqual(next_time_steps.observation, np.array([[20.0, 30.0]]))
  self.assertAllEqual(next_time_steps.reward, np.array([[0.0, 1.0]]))
  self.assertAllEqual(next_time_steps.discount, np.array([[1.0, 1.0]]))
  self.assertAllEqual(policy_steps.action, np.array([[11.0, 22.0]]))
  self.assertAllEqual(policy_steps.info, np.array([[1.0, 2.0]]))
def testAgentTrajectoryTrain(self):
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      self._obs_spec,
      self._action_spec,
      fc_layer_params=(10,),
      continuous_projection_net=tanh_normal_projection_network
      .TanhNormalProjectionNetwork)
  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=DummyCriticNet(),
      actor_network=actor_net,
      actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))
  trajectory_spec = trajectory.Trajectory(
      step_type=self._time_step_spec.step_type,
      observation=self._time_step_spec.observation,
      action=self._action_spec,
      policy_info=(),
      next_step_type=self._time_step_spec.step_type,
      reward=tensor_spec.BoundedTensorSpec(
          [], tf.float32, minimum=0.0, maximum=1.0, name='reward'),
      discount=self._time_step_spec.discount)
  sample_trajectory_experience = tensor_spec.sample_spec_nest(
      trajectory_spec, outer_dims=(3, 2))
  agent.train(sample_trajectory_experience)
def testTrainMaskingRewardMultipleEpisodesRewardOnLast(self):
  # Test that train reacts correctly to experience when there are:
  #   * Multiple MDP episodes
  #   * Rewards on the tf.StepType.LAST transitions
  #
  # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (L, F) -> (F, L) -> (L, F)
  # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
  # action      : [0]       [1]       [2]       [3]
  # reward      : 0         3         0         4
  # ~is_boundary: 1         0         1         0
  # is_last     : 1         0         1         0
  # valid reward: 0*1       3*0       0*1       4*0
  #
  # The second & fourth action & reward should be masked out due to being on
  # a boundary (step_type=(L, F)) transition.
  #
  # The expected_loss is 0.0 in this case.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST,
                           ts.StepType.FIRST, ts.StepType.LAST])
  next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST,
                                ts.StepType.LAST, ts.StepType.FIRST])
  reward = tf.constant([0, 3, 0, 4], dtype=tf.float32)
  discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
  observations = tf.constant(
      [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
  actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.LAST should be counted.
  expected_loss = 0.0

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def _create_experience(_):
  observations = tf.constant(
      [
          [[1, 2], [3, 4], [5, 6]],
          [[1, 2], [3, 4], [5, 6]],
      ],
      dtype=tf.float32)
  mid_time_step_val = ts.StepType.MID.tolist()
  time_steps = ts.TimeStep(
      step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                            dtype=tf.float32)
  policy_info = {
      'dist_params': action_distribution_parameters,
      'value_prediction': value_preds,
  }

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)
  return agent._preprocess(experience)  # pylint: disable=protected-access
def testTrain(self, num_epochs, use_td_lambda_return):
  with tf.compat.v2.summary.record_if(False):
    # Mock the build_train_op to return an op for incrementing this counter.
    counter = common.create_variable('test_train_counter')
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=num_epochs,
        use_gae=use_td_lambda_return,
        use_td_lambda_return=use_td_lambda_return,
        train_step_counter=counter)
    observations = tf.constant(
        [
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
        dtype=tf.float32)

    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                          dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    # Assert that counter starts out at zero.
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(0, self.evaluate(counter))

    self.evaluate(loss)

    # Assert that train_op ran increment_counter num_epochs times.
    self.assertEqual(num_epochs, self.evaluate(counter))
def testTrainWithRnn(self):
  actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
      self._obs_spec,
      self._action_spec,
      input_fc_layer_params=None,
      output_fc_layer_params=None,
      conv_layer_params=None,
      lstm_size=(40,),
  )
  critic_net = critic_rnn_network.CriticRnnNetwork(
      (self._obs_spec, self._action_spec),
      observation_fc_layer_params=(16,),
      action_fc_layer_params=(16,),
      joint_fc_layer_params=(16,),
      lstm_size=(16,),
      output_fc_layer_params=None,
  )
  counter = common.create_variable('test_train_counter')
  optimizer_fn = tf.compat.v1.train.AdamOptimizer

  agent = sac_agent.SacAgent(
      self._time_step_spec,
      self._action_spec,
      critic_network=critic_net,
      actor_network=actor_net,
      actor_optimizer=optimizer_fn(1e-3),
      critic_optimizer=optimizer_fn(1e-3),
      alpha_optimizer=optimizer_fn(1e-3),
      train_step_counter=counter,
  )

  batch_size = 5
  observations = tf.constant(
      [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
  actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      observation=[observations])

  experience = trajectory.Trajectory(time_steps.step_type, [observations],
                                     actions, (), time_steps.step_type,
                                     time_steps.reward, time_steps.discount)

  # Force variable creation.
  agent.policy.variables()

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertEqual(self.evaluate(counter), 0)
  self.evaluate(loss)
  self.assertEqual(self.evaluate(counter), 1)
def testTrainMaskingPartialEpisodeMultipleEpisodesRewardOnFirst(self):
  # Test that train reacts correctly to experience when there are:
  #   * Multiple MDP episodes
  #   * Rewards on the tf.StepType.FIRST transitions
  #   * A partial episode at the end of the experience
  #
  # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (L, F) -> (F, M) -> (M, M)
  # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
  # action      : [0]       [1]       [2]       [3]
  # reward      : 3         0         4         0
  # ~is_boundary: 1         0         1         1
  # is_last     : 1         0         0         0
  # valid reward: 3*1       0*0       4*0       0*0
  #
  # The second action & reward should be masked out due to being on a
  # boundary (step_type=(L, F)) transition. The third & fourth transitions
  # should be masked out entirely because they belong to an incomplete
  # episode (notice there is no trailing step_type=(F, L)).
  #
  # The expected_loss is > 0.0 in this case, matching the expected_loss of
  # the testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss
  # test, because the partial second episode is masked out.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST,
                           ts.StepType.FIRST, ts.StepType.MID])
  next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST,
                                ts.StepType.MID, ts.StepType.MID])
  reward = tf.constant([3, 0, 4, 0], dtype=tf.float32)
  discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
  observations = tf.constant(
      [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
  actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.FIRST should be counted.
  expected_loss = 10.8935775757

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def _create_batched_trajectory(self, batch_size):
  return trajectory.Trajectory(
      observation=(),
      action=tf.range(batch_size, dtype=tf.int32),
      policy_info=(),
      reward=tf.range(batch_size, dtype=tf.float32),
      discount=tf.ones(batch_size),
      step_type=ts.StepType.FIRST,
      next_step_type=ts.StepType.LAST)
def replace_traj_reward(traj, reward):
  return trajectory.Trajectory(
      step_type=traj.step_type,
      observation=traj.observation,
      action=traj.action,
      policy_info=traj.policy_info,
      next_step_type=traj.next_step_type,
      reward=reward,
      discount=traj.discount)
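# A shorter sketch of the same field swap: Trajectory is a namedtuple
# subclass that exposes a `replace` helper, so the explicit rebuild above can
# usually be collapsed to one line. The function name here is hypothetical.
def replace_traj_reward_via_helper(traj, reward):
  return traj.replace(reward=reward)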
def _create_trajectory(self):
  return trajectory.Trajectory(
      observation=(),
      action=tf.constant(1),
      policy_info=(),
      reward=tf.constant(1.0),
      discount=tf.constant(1.0),
      step_type=ts.StepType.FIRST,
      next_step_type=ts.StepType.LAST)
def load_tfrecord_dataset(dataset_files, buffer_size=1000,
                          as_experience=False, as_trajectories=False):
  """Loads a TFRecord dataset from file, sequencing samples as Trajectories.

  Args:
    dataset_files: List of paths to one or more datasets.
    buffer_size: (int) Number of bytes in the read buffer. 0 means no
      buffering.
    as_experience: (bool) Returns the dataset as pairs of Trajectories.
      Samples are shaped as if they had been pulled from a replay buffer with
      `num_steps=2`, and can be fed directly to an agent's `train` method.
    as_trajectories: (bool) Remaps the data into Trajectory objects. Enable
      this when the resulting samples must be Trajectories, as expected by
      agents.

  Returns:
    A dataset of type tf.data.Dataset. Samples follow the dataset's spec
    nested structure and are generated with a leading batch dim of 1 (or 2 if
    as_experience is enabled).

  Raises:
    IOError: One or more of the dataset files does not exist.
  """
  specs = []
  for dataset_file in dataset_files:
    spec_path = dataset_file + _SPEC_FILE_EXTENSION
    dataset_spec = parse_encoded_spec_from_file(spec_path)
    specs.append(dataset_spec)
  if not all([dataset_spec == spec for spec in specs]):
    raise IOError('One or more of the encoding specs do not match.')

  decoder = example_encoding.get_example_decoder(specs[0])
  logging.info('Loading TFRecord dataset...')
  dataset = tf.data.TFRecordDataset(
      dataset_files,
      buffer_size=buffer_size,
      num_parallel_reads=len(dataset_files))

  def decode_fn(proto):
    """Decodes a proto object."""
    return decoder(proto)

  def decode_and_batch_fn(proto):
    """Decodes a proto object, and batches the output tensors."""
    sample = decoder(proto)
    return nest_utils.batch_nested_tensors(sample)

  if as_experience:
    dataset = dataset.map(decode_fn).batch(2)
  else:
    dataset = dataset.map(decode_and_batch_fn)

  if as_trajectories:
    as_trajectories_fn = lambda sample: trajectory.Trajectory(*sample)
    dataset = dataset.map(as_trajectories_fn)
  return dataset
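# A minimal usage sketch for load_tfrecord_dataset; the file path is
# hypothetical and assumes a dataset written with the matching encoder and a
# spec file stored alongside it.
dataset = load_tfrecord_dataset(
    ['/tmp/collect_policy_data.tfrecord'],  # hypothetical path
    as_experience=True,     # batch consecutive steps into (t, t+1) pairs
    as_trajectories=True)   # remap samples into Trajectory objects
for experience in dataset.take(1):
  print(experience.step_type.shape)  # leading dim of 2 (num_steps=2)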
def item_from_trajectory(self, pb2_trajectory):
  return trajectory.Trajectory(
      step_type=np.squeeze(
          np.frombuffer(pb2_trajectory.step_type, dtype=np.int32)),
      observation=np.squeeze(
          np.frombuffer(pb2_trajectory.observation,
                        dtype=np.float32).reshape(self.obs_shape)),
      action=np.squeeze(
          np.frombuffer(pb2_trajectory.action, dtype=np.int32)),
      # policy_info is not deserialized here; if needed, it could be restored
      # with np.frombuffer(pb2_trajectory.policy_info, dtype=np.float32).
      policy_info=(),
      next_step_type=np.squeeze(
          np.frombuffer(pb2_trajectory.next_step_type, dtype=np.int32)),
      reward=np.squeeze(
          np.frombuffer(pb2_trajectory.reward, dtype=np.float32)),
      discount=np.squeeze(
          np.frombuffer(pb2_trajectory.discount, dtype=np.float32)))
def testTrainMaskingRewardMultipleBanditEpisodes(self):
  # Test that train reacts correctly to experience when there are multiple
  # Bandit episodes. Bandit episodes are encoded differently than MDP
  # episodes: each has only a single transition, with
  # step_type=StepType.FIRST and next_step_type=StepType.LAST. This test
  # helps ensure that LAST->FIRST->LAST transitions are handled correctly.
  #
  # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
  #
  # Experience looks like this:
  # Trajectories: (F, L) -> (F, L)
  # observation : [1, 2]    [1, 2]
  # action      : [0]       [2]
  # reward      : 3         4
  # ~is_boundary: 1         1
  # is_last     : 1         1
  # valid reward: 3*1       4*1
  #
  # All bandit transitions are valid and none are masked.
  #
  # The expected_loss is > 0.0 in this case, matching the expected_loss of
  # the testMaskingRewardMultipleEpisodesRewardOnFirst policy_gradient_loss
  # test.
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      use_advantage_loss=False,
      normalize_returns=False,
  )

  step_type = tf.constant([ts.StepType.FIRST, ts.StepType.FIRST])
  next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.LAST])
  reward = tf.constant([3, 4], dtype=tf.float32)
  discount = tf.constant([0, 0], dtype=tf.float32)
  observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
  actions = tf.constant([[0], [2]], dtype=tf.float32)

  experience = nest_utils.batch_nested_tensors(
      trajectory.Trajectory(step_type, observations, actions, (),
                            next_step_type, reward, discount))

  # Rewards on the StepType.FIRST should be counted.
  expected_loss = 12.2091741562

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_info = self.evaluate(loss)
  self.assertAllClose(loss_info.loss, expected_loss)
def __init__(self,
             time_step_spec: ts.TimeStep,
             action_spec: types.NestedTensorSpec,
             info_spec: types.NestedTensorSpec):
  """Creates a DataContext.

  Note: The context does not store a state spec, or other information about
  a Policy's internal state. Policy state is not typically stored in a
  replay buffer or on disk, except when the policy explicitly chooses to
  store it by adding the state as a field inside its `info` output. In
  those cases, the internal policy state spec is represented as part of the
  `info_spec`.

  Args:
    time_step_spec: A nest of `tf.TimeStep` representing the time_steps.
    action_spec: A nest of `tf.TypeSpec` representing the actions.
    info_spec: A nest of `tf.TypeSpec` representing the policy's info.
      (Typically this is the info emitted by the collect policy.)

  Raises:
    TypeError: If any of the specs are not nests containing tf.TypeSpec
      objects.
  """
  def _each_isinstance(spec, spec_types):
    """Checks if each element of `spec` is an instance of `spec_types`."""
    return all([isinstance(s, spec_types) for s in tf.nest.flatten(spec)])

  for (spec, label) in ((time_step_spec, 'time_step_spec'),
                        (action_spec, 'action_spec'),
                        (info_spec, 'info_spec')):
    if not _each_isinstance(spec, tf.TypeSpec):
      raise TypeError(
          '{} has to contain TypeSpec (TensorSpec, SparseTensorSpec, etc) '
          'objects, but received: {}'.format(label, spec))

  self._time_step_spec = time_step_spec
  self._action_spec = action_spec
  self._info_spec = info_spec
  self._trajectory_spec = trajectory.Trajectory(
      step_type=time_step_spec.step_type,
      observation=time_step_spec.observation,
      action=action_spec,
      policy_info=info_spec,
      next_step_type=time_step_spec.step_type,
      reward=time_step_spec.reward,
      discount=time_step_spec.discount)
  self._transition_spec = trajectory.Transition(
      time_step=time_step_spec,
      action_step=policy_step.PolicyStep(
          action=action_spec, state=(), info=info_spec),
      next_time_step=time_step_spec)
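# A minimal construction sketch for DataContext, assuming a hypothetical
# `env` whose array specs are converted to tensor specs (as in the Reverb
# setUp above); an empty info_spec is used for a policy with no info output.
tensor_time_step_spec = tf.nest.map_structure(
    tensor_spec.from_spec, env.time_step_spec())
context = DataContext(
    time_step_spec=tensor_time_step_spec,
    action_spec=tensor_spec.from_spec(env.action_spec()),
    info_spec=())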
import tensorflow as tf
from tf_agents.trajectories import trajectory


def trajectory_for_bandit(initial_step, action_step, final_step):
  return trajectory.Trajectory(
      observation=tf.expand_dims(initial_step.observation, 0),
      action=tf.expand_dims(action_step.action, 0),
      policy_info=action_step.info,
      reward=tf.expand_dims(final_step.reward, 0),
      discount=tf.expand_dims(final_step.discount, 0),
      step_type=tf.expand_dims(initial_step.step_type, 0),
      next_step_type=tf.expand_dims(final_step.step_type, 0))
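# A collection sketch using the helper above: one bandit step per call, where
# `env` and `policy` are assumed to exist with matching specs.
initial_step = env.reset()
action_step = policy.action(initial_step)
final_step = env.step(action_step.action)
experience = trajectory_for_bandit(initial_step, action_step, final_step)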
def build_tf_trajectory(traj_dict, data_spec_dict):
  """Builds a trajectory of tensors based on the data and the spec provided.

  Args:
    traj_dict: Dict containing trajectory data stored as numpy data types.
    data_spec_dict: Dict mapping every trajectory field to its expected
      TensorSpec.

  Returns:
    A Trajectory of tensors and the corresponding Trajectory of specs.
  """
  traj_tensor_dict = {}
  traj_spec = {}
  for field_name, data in traj_dict.items():
    traj_tensor_dict[field_name], traj_spec[field_name] = (
        convert_data_to_tensor(data, data_spec_dict[field_name]))
  return tj.Trajectory(**traj_tensor_dict), tj.Trajectory(**traj_spec)
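# A hypothetical usage sketch: both dicts must cover every Trajectory field,
# and convert_data_to_tensor (defined elsewhere) is assumed to return a
# (tensor, TensorSpec) pair for each field. The dummy values are illustrative
# only.
field_names = ['step_type', 'observation', 'action', 'policy_info',
               'next_step_type', 'reward', 'discount']
traj_dict = {name: np.zeros([1], dtype=np.float32) for name in field_names}
spec_dict = {name: tf.TensorSpec([1], tf.float32) for name in field_names}
traj, traj_spec = build_tf_trajectory(traj_dict, spec_dict)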
def _get_episode(args, start_pos):
  new_args = {}
  for k, v in args.items():
    if k == 'policy_info':
      new_args[k] = v
    elif k == 'observation':
      new_args[k] = {'pixels': v['pixels'][start_pos::2]}
    else:
      new_args[k] = v[start_pos::2]
  return trajectory.Trajectory(**new_args)
def _create_test_trajectory(self, batch_size):
  num_actions = tf.cast(batch_size / 2, dtype=tf.int32)
  action_tensor = tf.concat(
      [tf.range(num_actions, dtype=tf.int32),
       tf.range(num_actions, dtype=tf.int32)],
      axis=-1)
  return trajectory.Trajectory(
      observation=tf.ones(batch_size),
      action=action_tensor,
      policy_info=(),
      reward=tf.range(batch_size, dtype=tf.float32),
      discount=tf.ones(batch_size),
      step_type=ts.StepType.FIRST,
      next_step_type=ts.StepType.LAST)
def _create_batched_trajectory_with_reward_dict(self, batch_size):
  reward_dict = {
      'reward': tf.range(batch_size, dtype=tf.float32),
      'constraint': tf.range(batch_size, dtype=tf.float32),
  }
  return trajectory.Trajectory(
      observation=(),
      action=tf.range(batch_size, dtype=tf.int32),
      policy_info=(),
      reward=reward_dict,
      discount=tf.ones(batch_size),
      step_type=ts.StepType.FIRST,
      next_step_type=ts.StepType.LAST)
def create_trajectory(state: types.Array,
                      action: types.Array,
                      discount: types.Array,
                      reward: types.Array,
                      step_type: types.Array,
                      next_step_type: types.Array) -> trajectory.Trajectory:
  """Creates a Trajectory from current and next state information."""
  return trajectory.Trajectory(
      step_type=step_type,
      observation=state,
      action=action,
      policy_info=(),
      next_step_type=next_step_type,
      reward=reward,
      discount=discount)
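# A usage sketch with hypothetical single-step numpy inputs.
traj = create_trajectory(
    state=np.array([0.1, 0.2], dtype=np.float32),
    action=np.array(1, dtype=np.int32),
    discount=np.array(1.0, dtype=np.float32),
    reward=np.array(0.5, dtype=np.float32),
    step_type=np.array(ts.StepType.FIRST, dtype=np.int32),
    next_step_type=np.array(ts.StepType.MID, dtype=np.int32))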
def testTrainWithRnn(self):
  action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)

  batch_size = 5
  observations = tf.constant(
      [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
  actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.int32)
  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
      observation=[observations])

  experience = trajectory.Trajectory(
      step_type=time_steps.step_type,
      observation=observations,
      action=actions,
      policy_info=(),
      next_step_type=time_steps.step_type,
      reward=time_steps.reward,
      discount=time_steps.discount)

  categorical_q_rnn_network = DummyCategoricalQRnnNetwork(
      self._obs_spec,
      action_spec,
      conv_layer_params=None,
      input_fc_layer_params=(16,),
      preprocessing_combiner=None,
      lstm_size=(40,),
      output_fc_layer_params=(16,),
  )
  counter = common.create_variable('test_train_counter')

  agent = categorical_dqn_agent.CategoricalDqnAgent(
      self._time_step_spec,
      action_spec,
      categorical_q_rnn_network,
      optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
      # Wire in the counter so the step assertions below track training.
      train_step_counter=counter,
  )

  # Force variable creation.
  agent.policy.variables()

  if tf.executing_eagerly():
    loss = lambda: agent.train(experience)
  else:
    loss = agent.train(experience)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertEqual(self.evaluate(counter), 0)
  self.evaluate(loss)