Example #1
  def testBatchedEnvironment(self, max_steps, max_episodes, expected_length):

    expected_trajectories = [
        trajectory.Trajectory(
            step_type=np.array([0, 0]),
            observation=np.array([0, 0]),
            action=np.array([2, 1]),
            policy_info=np.array([4, 2]),
            next_step_type=np.array([1, 1]),
            reward=np.array([1., 1.]),
            discount=np.array([1., 1.])),
        trajectory.Trajectory(
            step_type=np.array([1, 1]),
            observation=np.array([2, 1]),
            action=np.array([1, 2]),
            policy_info=np.array([2, 4]),
            next_step_type=np.array([2, 1]),
            reward=np.array([1., 1.]),
            discount=np.array([0., 1.])),
        trajectory.Trajectory(
            step_type=np.array([2, 1]),
            observation=np.array([3, 3]),
            action=np.array([2, 1]),
            policy_info=np.array([4, 2]),
            next_step_type=np.array([0, 2]),
            reward=np.array([0., 1.]),
            discount=np.array([1., 0.]))
    ]

    env1 = driver_test_utils.PyEnvironmentMock(final_state=3)
    env2 = driver_test_utils.PyEnvironmentMock(final_state=4)
    env = batched_py_environment.BatchedPyEnvironment([env1, env2])
    tf_env = tf_py_environment.TFPyEnvironment(env)

    policy = driver_test_utils.TFPolicyMock(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        batch_size=2,
        initial_policy_state=tf.constant([1, 2], dtype=tf.int32))

    replay_buffer_observer = MockReplayBufferObserver()

    driver = tf_driver.TFDriver(
        tf_env,
        policy,
        observers=[replay_buffer_observer],
        max_steps=max_steps,
        max_episodes=max_episodes,
    )
    initial_time_step = tf_env.reset()
    initial_policy_state = tf.constant([1, 2], dtype=tf.int32)
    self.evaluate(driver.run(initial_time_step, initial_policy_state))
    trajectories = replay_buffer_observer.gather_all()

    self.assertEqual(
        len(trajectories), len(expected_trajectories[:expected_length]))

    for t1, t2 in zip(trajectories, expected_trajectories[:expected_length]):
      for t1_field, t2_field in zip(t1, t2):
        self.assertAllEqual(t1_field, t2_field)
Example #2
    def test_collect_data_spec_trajectory(self):
        episode_dict = {
            'states':
            np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
                     dtype=np.float32),
            'actions':
            np.array([[1.], [2.], [3.], [4.]], dtype=np.float32),
            'rewards':
            np.array([[0.], [1.], [0.], [1.]], dtype=np.float32),
            'discounts':
            np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32),
            'episode_start_index':
            np.array([0, 2], dtype=np.int32)
        }

        expected_spec = trajectory.Trajectory(
            step_type=ArraySpec(shape=[], dtype=np.int32),
            observation=ArraySpec(shape=[2], dtype=np.float32),
            action=ArraySpec(shape=[1], dtype=np.float32),
            policy_info=(),
            next_step_type=ArraySpec(shape=[], dtype=np.int32),
            reward=ArraySpec(shape=[1], dtype=np.float32),
            discount=ArraySpec(shape=[], dtype=np.float32))
        actual_spec = create_collect_data_spec(episode_dict,
                                               use_trajectories=True)
        self.assertEqual(actual_spec, expected_spec)
Example #3
  def testAverageReturnMultiMetricTimeMisalignment(
      self, run_mode, num_trajectories, reward_spec, expected_result):
    with run_mode():
      trajectories = self._create_misaligned_trajectories()
      multi_trajectories = []
      for traj in trajectories:
        if isinstance(reward_spec, list):
          new_reward = [traj.reward, traj.reward]
        else:
          new_reward = tf.stack([traj.reward, traj.reward], axis=1)
        new_traj = trajectory.Trajectory(
            step_type=traj.step_type,
            observation=traj.observation,
            action=traj.action,
            policy_info=traj.policy_info,
            next_step_type=traj.next_step_type,
            reward=new_reward,
            discount=traj.discount)
        multi_trajectories.append(new_traj)

      metric = tf_metrics.AverageReturnMultiMetric(reward_spec, batch_size=2)
      self.evaluate(tf.compat.v1.global_variables_initializer())
      self.evaluate(metric.init_variables())
      for i in range(num_trajectories):
        self.evaluate(metric(multi_trajectories[i]))

      self.assertAllEqual(expected_result, self.evaluate(metric.result()))
      self.evaluate(metric.reset())
      self.assertAllEqual([0.0, 0.0], self.evaluate(metric.result()))
Example #4
    def testToNStepTransitionForNEquals1(self):
        first = ts.StepType.FIRST
        last = ts.StepType.LAST

        # Define a batch size 1, 2-step trajectory.
        traj = trajectory.Trajectory(
            step_type=np.array([[first, last]]),
            next_step_type=np.array([[last, first]]),
            observation=np.array([[10.0, 20.0]]),
            action=np.array([[11.0, 22.0]]),
            # reward & discount values at step 1 are invalid dummy values.
            reward=np.array([[-1.0, 0.0]]),
            discount=np.array([[0.9, 0.0]]),
            policy_info=np.array([[10.0, 20.0]]))

        transition = trajectory.to_n_step_transition(traj, gamma=0.5)
        self.assertIsInstance(transition, trajectory.Transition)
        time_steps, policy_steps, next_time_steps = transition

        self.assertAllEqual(time_steps.step_type, np.array([first]))
        self.assertAllEqual(time_steps.observation, np.array([10.0]))
        self.assertAllEqual(time_steps.reward, np.array([np.nan]))
        self.assertAllEqual(time_steps.discount, np.array([np.nan]))

        self.assertAllEqual(next_time_steps.step_type, np.array([last]))
        self.assertAllEqual(next_time_steps.observation, np.array([20.0]))
        # r0
        self.assertAllEqual(next_time_steps.reward, np.array([-1.0]))
        # d0
        self.assertAllEqual(next_time_steps.discount, np.array([0.9]))

        self.assertAllEqual(policy_steps.action, np.array([11.0]))
        self.assertAllEqual(policy_steps.info, np.array([10.0]))
Example #5
    def testAgentTrajectoryTrain(self):
        agent = td3_agent.Td3Agent(
            self._time_step_spec,
            self._action_spec,
            critic_network=self._critic_net,
            actor_network=self._bounded_actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        )

        trajectory_spec = trajectory.Trajectory(
            step_type=self._time_step_spec.step_type,
            observation=self._time_step_spec.observation,
            action=self._action_spec,
            policy_info=(),
            next_step_type=self._time_step_spec.step_type,
            reward=tensor_spec.BoundedTensorSpec([],
                                                 tf.float32,
                                                 minimum=0.0,
                                                 maximum=1.0,
                                                 name='reward'),
            discount=self._time_step_spec.discount)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            trajectory_spec, outer_dims=(3, 2))
        agent.train(sample_trajectory_experience)
Example #6
  def setUp(self):
    super(ReverbReplayBufferTest, self).setUp()

    # Prepare the environment (and the corresponding specs).
    self._env = test_envs.EpisodeCountingEnv(steps_per_episode=3)
    tensor_time_step_spec = tf.nest.map_structure(tensor_spec.from_spec,
                                                  self._env.time_step_spec())
    tensor_action_spec = tensor_spec.from_spec(self._env.action_spec())
    self._data_spec = trajectory.Trajectory(
        step_type=tensor_time_step_spec.step_type,
        observation=tensor_time_step_spec.observation,
        action=tensor_action_spec,
        policy_info=(),
        next_step_type=tensor_time_step_spec.step_type,
        reward=tensor_time_step_spec.reward,
        discount=tensor_time_step_spec.discount,
    )
    table_spec = tf.nest.map_structure(
        lambda s: tf.TensorSpec(dtype=s.dtype, shape=(None,) + s.shape),
        self._data_spec)
    self._array_data_spec = tensor_spec.to_nest_array_spec(self._data_spec)

    # Initialize and start a Reverb server (and set up a client to it).
    self._table_name = 'test_table'
    uniform_table = reverb.Table(
        self._table_name,
        max_size=100,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=table_spec,
    )
    self._server = reverb.Server([uniform_table])
    self._py_client = reverb.Client('localhost:{}'.format(self._server.port))
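A possible continuation, sketched under assumptions: the table and specs above can back a TF-Agents replay buffer attached to the local server. The constructor arguments shown (data_spec, table_name, sequence_length, local_server) are my reading of the tf_agents ReverbReplayBuffer API; check them against the installed version.

    # Sketch only: attach a replay buffer to the Reverb server started above.
    # `sequence_length=2` is an arbitrary illustrative choice.
    from tf_agents.replay_buffers import reverb_replay_buffer

    replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
        self._data_spec,
        table_name=self._table_name,
        sequence_length=2,
        local_server=self._server)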
Example #7
  def testWithAdvantageFn(self, with_value_network):
    advantage_fn = mock.Mock(
        side_effect=lambda returns, _: returns)

    value_network = (DummyValueNet(self._obs_spec) if with_value_network
                     else None)
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=False),
        value_network=value_network,
        advantage_fn=advantage_fn,
        optimizer=None,
    )

    step_type = tf.constant([[ts.StepType.FIRST, ts.StepType.LAST,
                              ts.StepType.FIRST, ts.StepType.LAST]])
    next_step_type = tf.constant([[ts.StepType.LAST, ts.StepType.FIRST,
                                   ts.StepType.LAST, ts.StepType.FIRST]])
    reward = tf.constant([[0, 0, 0, 0]], dtype=tf.float32)
    discount = tf.constant([[1, 1, 1, 1]], dtype=tf.float32)
    observations = tf.constant(
        [[[1, 2], [1, 2], [1, 2], [1, 2]]], dtype=tf.float32)
    actions = tf.constant([[[0], [1], [2], [3]]], dtype=tf.float32)

    experience = trajectory.Trajectory(
        step_type, observations, actions, (), next_step_type, reward, discount)

    agent.total_loss(experience, reward, None)

    advantage_fn.assert_called_once()
Example #8
def _create_trajectories(n_time_steps, batch_size):
    # Observation looks like:
    # [[ 0.,  1., ...,  n_time_steps - 1.],
    #  [10., 11., ..., 10 + n_time_steps - 1.],
    #  [20., 21., ..., 20 + n_time_steps - 1.],
    #  [ ...                                ],
    #  [10 * (batch_size - 1)., ..., 10 * (batch_size - 1) + n_time_steps - 1.]]
    observation_array = np.asarray(
        [np.arange(n_time_steps) + 10 * i for i in range(batch_size)])
    observations = tf.convert_to_tensor(observation_array, dtype=tf.float32)

    default_tensor = tf.constant([[1] * n_time_steps] * batch_size,
                                 dtype=tf.float32)
    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(step_type=tf.constant(
        [[mid_time_step_val] * n_time_steps] * batch_size, dtype=tf.int32),
                             reward=default_tensor,
                             discount=default_tensor,
                             observation=observations)
    actions = tf.constant([[[1]] * n_time_steps] * batch_size,
                          dtype=tf.float32)
    policy_info = {
        'dist_params': {
            'loc':
            tf.constant([[[1]] * n_time_steps] * batch_size, dtype=tf.float32),
            'scale':
            tf.constant([[[1]] * n_time_steps] * batch_size, dtype=tf.float32)
        },
        'value_prediction': default_tensor,
        'return': default_tensor,
        'advantage': default_tensor,
    }
    return trajectory.Trajectory(time_steps.step_type, observations, actions,
                                 policy_info, time_steps.step_type,
                                 time_steps.reward, time_steps.discount)
Example #9
 def testProcessExperienceGlobalFeatures(self):
     observation_spec = {
         'f1': tf.TensorSpec(shape=(5, ), dtype=tf.string),
         'f2': tf.TensorSpec(shape=(5, 2), dtype=tf.int32)
     }
     time_step_spec = time_step.time_step_spec(observation_spec)
     training_data_spec = trajectory.Trajectory(
         step_type=time_step_spec.step_type,
         observation=time_step_spec.observation,
         action=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=4,
                                              dtype=tf.int32),
         policy_info=(),
         next_step_type=time_step_spec.step_type,
         reward=tensor_spec.BoundedTensorSpec(shape=(),
                                              minimum=0,
                                              maximum=2,
                                              dtype=tf.float32),
         discount=time_step_spec.discount)
     experience = tensor_spec.sample_spec_nest(training_data_spec,
                                               outer_dims=(7, 2))
     observation, action, reward = utils.process_experience_for_neural_agents(
         experience, False, training_data_spec)
     self.assertAllEqual(observation['f1'][0],
                         experience.observation['f1'][0, 0])
     self.assertEqual(action[0], experience.action[0, 0])
     self.assertEqual(reward[0], experience.reward[0, 0])
Example #10
    def testToTransition(self):
        first = ts.StepType.FIRST
        mid = ts.StepType.MID
        last = ts.StepType.LAST

        # Define a batch size 1, 3-step trajectory.
        traj = trajectory.Trajectory(
            step_type=np.array([[first, mid, last]]),
            next_step_type=np.array([[mid, last, first]]),
            observation=np.array([[10.0, 20.0, 30.0]]),
            action=np.array([[11.0, 22.0, 33.0]]),
            # reward at step 2 is an invalid dummy reward.
            reward=np.array([[0.0, 1.0, 2.0]]),
            discount=np.array([[1.0, 1.0, 0.0]]),
            policy_info=np.array([[1.0, 2.0, 3.0]]))

        time_steps, policy_steps, next_time_steps = trajectory.to_transition(
            traj)

        self.assertAllEqual(time_steps.step_type, np.array([[first, mid]]))
        self.assertAllEqual(time_steps.observation, np.array([[10.0, 20.0]]))

        self.assertAllEqual(next_time_steps.step_type, np.array([[mid, last]]))
        self.assertAllEqual(next_time_steps.observation,
                            np.array([[20.0, 30.0]]))
        self.assertAllEqual(next_time_steps.reward, np.array([[0.0, 1.0]]))
        self.assertAllEqual(next_time_steps.discount, np.array([[1.0, 1.0]]))

        self.assertAllEqual(policy_steps.action, np.array([[11.0, 22.0]]))
        self.assertAllEqual(policy_steps.info, np.array([[1.0, 2.0]]))
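A plain-NumPy restatement of the slicing these assertions exercise. The [:-1]/[1:] split is stated here as an assumption about what to_transition returns, but it reproduces the expected arrays above.

import numpy as np

obs = np.array([[10.0, 20.0, 30.0]])
reward = np.array([[0.0, 1.0, 2.0]])

# time_steps cover indices 0..T-2, next_time_steps cover indices 1..T-1,
# and next_time_steps reuse reward/discount from indices 0..T-2.
assert (obs[:, :-1] == np.array([[10.0, 20.0]])).all()
assert (obs[:, 1:] == np.array([[20.0, 30.0]])).all()
assert (reward[:, :-1] == np.array([[0.0, 1.0]])).all()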
Example #11
    def testAgentTrajectoryTrain(self):
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self._obs_spec,
            self._action_spec,
            fc_layer_params=(10, ),
            continuous_projection_net=tanh_normal_projection_network.
            TanhNormalProjectionNetwork)

        agent = sac_agent.SacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=DummyCriticNet(),
            actor_network=actor_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(0.001))

        trajectory_spec = trajectory.Trajectory(
            step_type=self._time_step_spec.step_type,
            observation=self._time_step_spec.observation,
            action=self._action_spec,
            policy_info=(),
            next_step_type=self._time_step_spec.step_type,
            reward=tensor_spec.BoundedTensorSpec([],
                                                 tf.float32,
                                                 minimum=0.0,
                                                 maximum=1.0,
                                                 name='reward'),
            discount=self._time_step_spec.discount)

        sample_trajectory_experience = tensor_spec.sample_spec_nest(
            trajectory_spec, outer_dims=(3, 2))
        agent.train(sample_trajectory_experience)
Example #12
    def testTrainMaskingRewardMultipleEpisodesRewardOnLast(self):
        # Test that train reacts correctly to experience when there are:
        #   * Multiple MDP episodes
        #   * Rewards on the ts.StepType.LAST transitions
        #
        # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
        #
        # Experience looks like this:
        # Trajectories: (F, L) -> (L, F) -> (F, L) -> (L, F)
        # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
        # action      :   [0]       [1]       [2]       [3]
        # reward      :    0         3         0         4
        # ~is_boundary:    1         0         1         0
        # is_last     :    1         0         1         0
        # valid reward:   0*1       3*0       0*1       4*0
        #
        # The second & fourth action & reward should be masked out because they
        # fall on boundary (step_type=(L, F)) transitions.
        #
        # The expected_loss is 0.0 in this case.
        agent = reinforce_agent.ReinforceAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=DummyActorNet(self._obs_spec,
                                        self._action_spec,
                                        unbounded_actions=True),
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            use_advantage_loss=False,
            normalize_returns=False,
        )

        step_type = tf.constant([
            ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
            ts.StepType.LAST
        ])
        next_step_type = tf.constant([
            ts.StepType.LAST, ts.StepType.FIRST, ts.StepType.LAST,
            ts.StepType.FIRST
        ])
        reward = tf.constant([0, 3, 0, 4], dtype=tf.float32)
        discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
        observations = tf.constant([[1, 2], [1, 2], [1, 2], [1, 2]],
                                   dtype=tf.float32)
        actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

        experience = nest_utils.batch_nested_tensors(
            trajectory.Trajectory(step_type, observations, actions, (),
                                  next_step_type, reward, discount))

        # Rewards on the StepType.LAST (boundary) transitions are masked out.
        expected_loss = 0.0

        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = self.evaluate(loss)
        self.assertAllClose(loss_info.loss, expected_loss)
Example #13
    def _create_experience(_):
        observations = tf.constant([
            [[1, 2], [3, 4], [5, 6]],
            [[1, 2], [3, 4], [5, 6]],
        ],
                                   dtype=tf.float32)
        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(step_type=tf.constant(
            [[mid_time_step_val] * 3] * 2, dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * 2,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * 2,
                                                      dtype=tf.float32),
                                 observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                  dtype=tf.float32)

        policy_info = {
            'dist_params': action_distribution_parameters,
        }
        policy_info['value_prediction'] = value_preds
        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)
        return agent._preprocess(experience)  # pylint: disable=protected-access
Example #14
    def testTrain(self, num_epochs, use_td_lambda_return):
        with tf.compat.v2.summary.record_if(False):
            # Mock the build_train_op to return an op for incrementing this counter.
            counter = common.create_variable('test_train_counter')
            agent = ppo_agent.PPOAgent(
                self._time_step_spec,
                self._action_spec,
                tf.compat.v1.train.AdamOptimizer(),
                actor_net=DummyActorNet(
                    self._obs_spec,
                    self._action_spec,
                ),
                value_net=DummyValueNet(self._obs_spec),
                normalize_observations=False,
                num_epochs=num_epochs,
                use_gae=use_td_lambda_return,
                use_td_lambda_return=use_td_lambda_return,
                train_step_counter=counter)
            observations = tf.constant([
                [[1, 2], [3, 4], [5, 6]],
                [[1, 2], [3, 4], [5, 6]],
            ],
                                       dtype=tf.float32)

            time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * 2,
                                                           dtype=tf.int32),
                                     reward=tf.constant([[1] * 3] * 2,
                                                        dtype=tf.float32),
                                     discount=tf.constant([[1] * 3] * 2,
                                                          dtype=tf.float32),
                                     observation=observations)
            actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                                  dtype=tf.float32)

            action_distribution_parameters = {
                'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
                'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
            }

            policy_info = action_distribution_parameters

            experience = trajectory.Trajectory(
                time_steps.step_type, observations, actions, policy_info,
                time_steps.step_type, time_steps.reward, time_steps.discount)

            # Force variable creation.
            agent.policy.variables()

            if tf.executing_eagerly():
                loss = lambda: agent.train(experience)
            else:
                loss = agent.train(experience)

            # Assert that counter starts out at zero.
            self.evaluate(tf.compat.v1.initialize_all_variables())
            self.assertEqual(0, self.evaluate(counter))
            self.evaluate(loss)
            # Assert that train_op ran increment_counter num_epochs times.
            self.assertEqual(num_epochs, self.evaluate(counter))
Example #15
  def testTrainWithRnn(self):
    actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        self._obs_spec,
        self._action_spec,
        input_fc_layer_params=None,
        output_fc_layer_params=None,
        conv_layer_params=None,
        lstm_size=(40,),
    )

    critic_net = critic_rnn_network.CriticRnnNetwork(
        (self._obs_spec, self._action_spec),
        observation_fc_layer_params=(16,),
        action_fc_layer_params=(16,),
        joint_fc_layer_params=(16,),
        lstm_size=(16,),
        output_fc_layer_params=None,
    )

    counter = common.create_variable('test_train_counter')

    optimizer_fn = tf.compat.v1.train.AdamOptimizer

    agent = sac_agent.SacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=critic_net,
        actor_network=actor_net,
        actor_optimizer=optimizer_fn(1e-3),
        critic_optimizer=optimizer_fn(1e-3),
        alpha_optimizer=optimizer_fn(1e-3),
        train_step_counter=counter,
    )

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
    actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=[observations])

    experience = trajectory.Trajectory(
        time_steps.step_type, [observations], actions, (),
        time_steps.step_type, time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()
    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(self.evaluate(counter), 0)
    self.evaluate(loss)
    self.assertEqual(self.evaluate(counter), 1)
Example #16
  def testTrainMaskingPartialEpisodeMultipleEpisodesRewardOnFirst(self):
    # Test that train reacts correctly to experience when there are:
    #   * Multiple MDP episodes
    #   * Rewards on the ts.StepType.FIRST transitions
    #   * Partial episode at end of experience
    #
    # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
    #
    # Experience looks like this:
    # Trajectories: (F, L) -> (L, F) -> (F, M) -> (M, M)
    # observation : [1, 2]    [1, 2]    [1, 2]    [1, 2]
    # action      :   [0]       [1]       [2]       [3]
    # reward      :    3         0         4         0
    # ~is_boundary:    1         0         1         1
    # is_last     :    1         0         0         0
    # valid reward:   3*1       0*0       4*0       0*0
    #
    # The second action & reward should be masked out because they fall on a
    # boundary (step_type=(L, F)) transition.  The third & fourth transitions
    # should be masked out entirely because they belong to an incomplete episode
    # (notice there is no trailing transition ending in StepType.LAST).
    #
    # The expected_loss is > 0.0 in this case, matching the expected_loss of the
    # testMaskingRewardSingleEpisodeRewardOnFirst policy_gradient_loss test,
    # because the partial second episode should be masked out.
    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=DummyActorNet(
            self._obs_spec, self._action_spec, unbounded_actions=True),
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        use_advantage_loss=False,
        normalize_returns=False,
    )

    step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST,
                             ts.StepType.FIRST, ts.StepType.MID])
    next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.FIRST,
                                  ts.StepType.MID, ts.StepType.MID])
    reward = tf.constant([3, 0, 4, 0], dtype=tf.float32)
    discount = tf.constant([1, 0, 1, 0], dtype=tf.float32)
    observations = tf.constant(
        [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
    actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)

    experience = nest_utils.batch_nested_tensors(trajectory.Trajectory(
        step_type, observations, actions, (), next_step_type, reward, discount))

    # Rewards on the StepType.FIRST should be counted.
    expected_loss = 10.8935775757

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_info = self.evaluate(loss)
    self.assertAllClose(loss_info.loss, expected_loss)
Example #17
 def _create_batched_trajectory(self, batch_size):
   return trajectory.Trajectory(observation=(),
                                action=tf.range(batch_size, dtype=tf.int32),
                                policy_info=(),
                                reward=tf.range(batch_size, dtype=tf.float32),
                                discount=tf.ones(batch_size),
                                step_type=ts.StepType.FIRST,
                                next_step_type=ts.StepType.LAST)
Example #18
def replace_traj_reward(traj, reward):
    return trajectory.Trajectory(step_type=traj.step_type,
                                 observation=traj.observation,
                                 action=traj.action,
                                 policy_info=traj.policy_info,
                                 next_step_type=traj.next_step_type,
                                 reward=reward,
                                 discount=traj.discount)
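A more compact equivalent, assuming the standard tf_agents Trajectory namedtuple and its replace() helper:

def replace_traj_reward_compact(traj, reward):
    # Same substitution as above: Trajectory is namedtuple-based, so
    # replace() returns a copy with only the reward field swapped out.
    return traj.replace(reward=reward)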
Example #19
 def _create_trajectory(self):
     return trajectory.Trajectory(observation=(),
                                  action=(tf.constant(1)),
                                  policy_info=(),
                                  reward=tf.constant(1.0),
                                  discount=tf.constant(1.0),
                                  step_type=ts.StepType.FIRST,
                                  next_step_type=ts.StepType.LAST)
Example #20
def load_tfrecord_dataset(dataset_files,
                          buffer_size=1000,
                          as_experience=False,
                          as_trajectories=False):
    """Loads a TFRecord dataset from file, sequencing samples as Trajectories.

  Args:
    dataset_files: List of paths to one or more datasets
    buffer_size: (int) number of bytes in the read buffer. 0 means no buffering.
    as_experience: (bool) Returns dataset as a pair of Trajectories. Samples
      will be shaped as if they had been pulled from a replay buffer with
      `num_steps=2`. These samples can be fed directly to agent's `train`
      method.
    as_trajectories: (bool) Remaps the data into trajectory objects. This should
      be enabled when the resulting types must be trajectories as expected by
      agents.

  Returns:
    A dataset of type tf.data.Dataset. Samples follow the dataset's spec nested
    structure. Samples are generated with a leading batch dim of 1
    (or 2 if as_experience is enabled).
  Raises:
    IOError: One or more of the dataset files does not exist.
  """

    specs = []
    for dataset_file in dataset_files:
        spec_path = dataset_file + _SPEC_FILE_EXTENSION
        dataset_spec = parse_encoded_spec_from_file(spec_path)
        specs.append(dataset_spec)
        if not all([dataset_spec == spec for spec in specs]):
            raise IOError('One or more of the encoding specs do not match.')
    decoder = example_encoding.get_example_decoder(specs[0])
    logging.info('Loading TFRecord dataset...')
    dataset = tf.data.TFRecordDataset(dataset_files,
                                      buffer_size=buffer_size,
                                      num_parallel_reads=len(dataset_files))

    def decode_fn(proto):
        """Decodes a proto object."""
        return decoder(proto)

    def decode_and_batch_fn(proto):
        """Decodes a proto object, and batch output tensors."""
        sample = decoder(proto)
        return nest_utils.batch_nested_tensors(sample)

    if as_experience:
        dataset = dataset.map(decode_fn).batch(2)
    else:
        dataset = dataset.map(decode_and_batch_fn)

    if as_trajectories:
        as_trajectories_fn = lambda sample: trajectory.Trajectory(*sample)
        dataset = dataset.map(as_trajectories_fn)
    return dataset
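A minimal usage sketch; the path below is hypothetical and assumes each TFRecord file has its spec sidecar (the _SPEC_FILE_EXTENSION file) next to it.

# Hypothetical file path, for illustration only.
dataset_files = ['/tmp/collect_data/policy_experience.tfrecord']
dataset = load_tfrecord_dataset(
    dataset_files, buffer_size=1000, as_trajectories=True)
for traj in dataset.take(1):
    print(traj.step_type, traj.reward)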
Example #21
 def item_from_trajectory(self, pb2_trajectory):
   return trajectory.Trajectory(
       step_type=np.squeeze(
           np.frombuffer(pb2_trajectory.step_type, dtype=np.int32)),
       observation=np.squeeze(
           np.frombuffer(pb2_trajectory.observation,
                         dtype=np.float32).reshape(self.obs_shape)),
       action=np.squeeze(np.frombuffer(pb2_trajectory.action, dtype=np.int32)),
       # policy_info bytes are not decoded here; an empty tuple is stored.
       policy_info=(),
       next_step_type=np.squeeze(
           np.frombuffer(pb2_trajectory.next_step_type, dtype=np.int32)),
       reward=np.squeeze(
           np.frombuffer(pb2_trajectory.reward, dtype=np.float32)),
       discount=np.squeeze(
           np.frombuffer(pb2_trajectory.discount, dtype=np.float32)))
Example #22
    def testTrainMaskingRewardMultipleBanditEpisodes(self):
        # Test that train reacts correctly to experience when there are multiple
        # Bandit episodes.  Bandit episodes are encoded differently than
        # MDP episodes.  They (each) have only a single transition with
        # step_type=StepType.FIRST and next_step_type=StepType.LAST.  This test
        # helps ensure that LAST->FIRST->LAST transitions are handled correctly.
        #
        # F, M, L = ts.StepType.{FIRST, MID, LAST} in the chart below.
        #
        # Experience looks like this:
        # Trajectories: (F, L) -> (F, L)
        # observation : [1, 2]    [1, 2]
        # action      :   [0]       [2]
        # reward      :    3         4
        # ~is_boundary:    0         0
        # is_last     :    1         1
        # valid reward:   3*1       4*1
        #
        # All bandit transitions are valid and none are masked.
        #
        # The expected_loss is > 0.0 in this case, matching the expected_loss of the
        # testMaskingRewardMultipleEpisodesRewardOnFirst policy_gradient_loss test.
        agent = reinforce_agent.ReinforceAgent(
            self._time_step_spec,
            self._action_spec,
            actor_network=DummyActorNet(self._obs_spec,
                                        self._action_spec,
                                        unbounded_actions=True),
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
            use_advantage_loss=False,
            normalize_returns=False,
        )

        step_type = tf.constant([ts.StepType.FIRST, ts.StepType.FIRST])
        next_step_type = tf.constant([ts.StepType.LAST, ts.StepType.LAST])
        reward = tf.constant([3, 4], dtype=tf.float32)
        discount = tf.constant([0, 0], dtype=tf.float32)
        observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
        actions = tf.constant([[0], [2]], dtype=tf.float32)

        experience = nest_utils.batch_nested_tensors(
            trajectory.Trajectory(step_type, observations, actions, (),
                                  next_step_type, reward, discount))

        # Rewards on the StepType.FIRST should be counted.
        expected_loss = 12.2091741562

        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        loss_info = self.evaluate(loss)
        self.assertAllClose(loss_info.loss, expected_loss)
Example #23
  def __init__(
      self,
      time_step_spec: ts.TimeStep,
      action_spec: types.NestedTensorSpec,
      info_spec: types.NestedTensorSpec
  ):
    """Creates a DataContext.

    Note: The context does not store a state spec, or other information about
    a Policy's internal state.  Policy state is not typically stored in a
    replay buffer or on disk, except when the policy explicitly chooses to
    store it by adding the state as a field inside its `info` output.  In
    those cases, the internal policy state spec is represented as part of the
    `info_spec`.

    Args:
      time_step_spec: A nest of `tf.TimeStep` representing the time_steps.
      action_spec: A nest of `tf.TypeSpec` representing the actions.
      info_spec: A nest of `tf.TypeSpec` representing the policy's info.
        (Typically this is the info emitted by the collect policy).

    Raises:
      TypeError: If any of the specs are not nests containing tf.TypeSpec
        objects.
    """
    def _each_isinstance(spec, spec_types):
      """Checks if each element of `spec` is instance of `spec_types`."""
      return all([isinstance(s, spec_types) for s in tf.nest.flatten(spec)])

    for (spec, label) in ((time_step_spec, 'time_step_spec'),
                          (action_spec, 'action_spec'),
                          (info_spec, 'info_spec')):
      if not _each_isinstance(spec, tf.TypeSpec):
        raise TypeError(
            '{} has to contain TypeSpec (TensorSpec, '
            'SparseTensorSpec, etc) objects, but received: {}'
            .format(label, spec))

    self._time_step_spec = time_step_spec
    self._action_spec = action_spec
    self._info_spec = info_spec
    self._trajectory_spec = trajectory.Trajectory(
        step_type=time_step_spec.step_type,
        observation=time_step_spec.observation,
        action=action_spec,
        policy_info=info_spec,
        next_step_type=time_step_spec.step_type,
        reward=time_step_spec.reward,
        discount=time_step_spec.discount)
    self._transition_spec = trajectory.Transition(
        time_step=time_step_spec,
        action_step=policy_step.PolicyStep(action=action_spec,
                                           state=(),
                                           info=info_spec),
        next_time_step=time_step_spec)
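A usage sketch, assuming this __init__ belongs to a DataContext-style class (as in tf_agents' data converters) constructed directly from specs:

import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

time_step_spec = ts.time_step_spec(tensor_spec.TensorSpec([2], tf.float32))
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1)
context = DataContext(time_step_spec, action_spec, info_spec=())
# The derived _trajectory_spec mirrors time_step_spec/action_spec/info_spec,
# reusing step_type for both step_type and next_step_type.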
Example #24
def trajectory_for_bandit(initial_step, action_step, final_step):
    import tensorflow as tf
    from tf_agents.trajectories import trajectory

    return trajectory.Trajectory(
        observation=tf.expand_dims(initial_step.observation, 0),
        action=tf.expand_dims(action_step.action, 0),
        policy_info=action_step.info,
        reward=tf.expand_dims(final_step.reward, 0),
        discount=tf.expand_dims(final_step.discount, 0),
        step_type=tf.expand_dims(initial_step.step_type, 0),
        next_step_type=tf.expand_dims(final_step.step_type, 0))
Example #25
def build_tf_trajectory(traj_dict, data_spec_dict):
    """
    build a trajectory of tensors based on the data and the spec provided

    Params:
        traj_dict: dict containing trajectory data stored as numpy data types
        data_spec_dict: a dict mapping every trajectory data to it's expected TensorSpec
    Return:
        tf.trajectory
        trajectory spec
    """

    traj_tensor_dict = {}
    traj_spec = {}

    for field_name, data in traj_dict.items():
        traj_tensor_dict[field_name], traj_spec[
            field_name] = convert_data_to_tensor(data,
                                                 data_spec_dict[field_name])

    return tj.Trajectory(**traj_tensor_dict), tj.Trajectory(**traj_spec)
Example #26
def _get_episode(args, start_pos):
    new_args = {}
    for k, v in args.items():
        if k == 'policy_info':
            new_args[k] = v
        elif k == 'observation':
            new_args[k] = {'pixels': v['pixels'][start_pos::2]}
        else:
            new_args[k] = v[start_pos::2]
    return trajectory.Trajectory(**new_args)
Example #27
 def _create_test_trajectory(self, batch_size):
   num_actions = tf.cast(batch_size / 2, dtype=tf.int32)
   action_tensor = tf.concat([
       tf.range(num_actions, dtype=tf.int32),
       tf.range(num_actions, dtype=tf.int32)], axis=-1)
   return trajectory.Trajectory(observation=tf.ones(batch_size),
                                action=action_tensor,
                                policy_info=(),
                                reward=tf.range(batch_size, dtype=tf.float32),
                                discount=tf.ones(batch_size),
                                step_type=ts.StepType.FIRST,
                                next_step_type=ts.StepType.LAST)
Example #28
 def _create_batched_trajectory_with_reward_dict(self, batch_size):
   reward_dict = {
       'reward': tf.range(batch_size, dtype=tf.float32),
       'constraint': tf.range(batch_size, dtype=tf.float32),
   }
   return trajectory.Trajectory(observation=(),
                                action=tf.range(batch_size, dtype=tf.int32),
                                policy_info=(),
                                reward=reward_dict,
                                discount=tf.ones(batch_size),
                                step_type=ts.StepType.FIRST,
                                next_step_type=ts.StepType.LAST)
Example #29
def create_trajectory(state: types.Array, action: types.Array,
                      discount: types.Array, reward: types.Array,
                      step_type: types.Array,
                      next_step_type: types.Array) -> trajectory.Trajectory:
    """Creates a Trajectory from current and next state information."""
    return trajectory.Trajectory(step_type=step_type,
                                 observation=state,
                                 action=action,
                                 policy_info=(),
                                 next_step_type=next_step_type,
                                 reward=reward,
                                 discount=discount)
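A small usage sketch with toy NumPy inputs; shapes are arbitrary:

import numpy as np
from tf_agents.trajectories import time_step as ts

traj = create_trajectory(
    state=np.array([[0.1, 0.2]], dtype=np.float32),
    action=np.array([1], dtype=np.int32),
    discount=np.array([1.0], dtype=np.float32),
    reward=np.array([0.5], dtype=np.float32),
    step_type=np.array([ts.StepType.FIRST], dtype=np.int32),
    next_step_type=np.array([ts.StepType.MID], dtype=np.int32))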
Example #30
    def testTrainWithRnn(self):
        action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)

        batch_size = 5
        observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                   dtype=tf.float32)
        actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.int32)
        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * batch_size,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * batch_size,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * batch_size,
                                                      dtype=tf.float32),
                                 observation=[observations])

        experience = trajectory.Trajectory(step_type=time_steps.step_type,
                                           observation=observations,
                                           action=actions,
                                           policy_info=(),
                                           next_step_type=time_steps.step_type,
                                           reward=time_steps.reward,
                                           discount=time_steps.discount)

        categorical_q_rnn_network = DummyCategoricalQRnnNetwork(
            self._obs_spec,
            action_spec,
            conv_layer_params=None,
            input_fc_layer_params=(16, ),
            preprocessing_combiner=None,
            lstm_size=(40, ),
            output_fc_layer_params=(16, ),
        )

        counter = common.create_variable('test_train_counter')

        agent = categorical_dqn_agent.CategoricalDqnAgent(
            self._time_step_spec,
            action_spec,
            categorical_q_rnn_network,
            optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        )

        # Force variable creation.
        agent.policy.variables()
        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual(self.evaluate(counter), 0)
        self.evaluate(loss)