def testBatchedPyEnvCompatible(self):
        """Checks PyTFEagerPolicy(batch_time_steps=False) drives a batched py env."""
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in eager.')

        net = actor_network.ActorNetwork(
            self._observation_tensor_spec,
            self._action_tensor_spec,
            fc_layer_params=(10, ),
        )
        policy = actor_policy.ActorPolicy(self._time_step_tensor_spec,
                                          self._action_tensor_spec,
                                          actor_network=net)
        # batch_time_steps=False: the batched env already emits batched steps.
        eager_policy = py_tf_eager_policy.PyTFEagerPolicy(
            policy, batch_time_steps=False)

        make_env = lambda: random_py_environment.RandomPyEnvironment(  # pylint: disable=g-long-lambda
            self._observation_spec, self._action_spec)
        env = batched_py_environment.BatchedPyEnvironment(
            [make_env() for _ in range(3)])

        step = env.reset()
        for _ in range(20):
            step = env.step(eager_policy.action(step).action)
    def testPyEnvCompatible(self):
        """Checks that a PyTFEagerPolicy can drive a pure-python environment."""
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in eager.')

        observation_spec = array_spec.ArraySpec([2], np.float32)
        action_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)

        # Tensor-spec counterparts for building the TF-side network/policy.
        observation_tensor_spec = tensor_spec.from_spec(observation_spec)
        action_tensor_spec = tensor_spec.from_spec(action_spec)
        time_step_tensor_spec = ts.time_step_spec(observation_tensor_spec)

        actor_net = actor_network.ActorNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(10, ),
        )

        tf_policy = actor_policy.ActorPolicy(time_step_tensor_spec,
                                             action_tensor_spec,
                                             actor_network=actor_net)

        py_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_policy)
        # Env will validate action types automatically since we provided the
        # action_spec.
        env = random_py_environment.RandomPyEnvironment(
            observation_spec, action_spec)

        time_step = env.reset()

        # Drive the env for a while; incompatible actions would raise here.
        for _ in range(100):
            action_step = py_policy.action(time_step)
            time_step = env.step(action_step.action)
Пример #3
0
    def _get_policies(self, time_step_spec, action_spec, cloning_network):
        """Returns the (eval, collect) policy pair; both are the same clipped ActorPolicy."""
        actor = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                         action_spec=action_spec,
                                         actor_network=cloning_network,
                                         clip=True)
        return actor, actor
Пример #4
0
  def testMasking(self):
    """Actions sampled under an action-constraint mask must all be valid."""
    batch_size = 1000
    state_dims = 5
    n_actions = 8
    observations = tf.random.uniform([batch_size, state_dims])
    time_step = ts.restart(observations, batch_size=batch_size)
    obs_spec = tensor_spec.TensorSpec([state_dims], tf.float32)
    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec([1], tf.int32, 0, n_actions - 1)

    # Fixed mask for testing; in practice the mask is part of the observation.
    mask = [0, 1, 0, 1, 0, 0, 1, 0]
    np_mask = np.array(mask)
    tf_mask = tf.constant([mask] * batch_size)
    net = actor_distribution_network.ActorDistributionNetwork(
        obs_spec, action_spec, fc_layer_params=(2, 1))
    splitter = lambda observation: (observation, tf_mask)
    policy = actor_policy.ActorPolicy(
        time_step_spec, action_spec, actor_network=net,
        observation_and_action_constraint_splitter=splitter)

    # Force creation of variables before global_variables_initializer.
    policy.variables()
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Sample many actions and check every one maps to a 1 in the mask.
    action = self.evaluate(policy.action(time_step).action)
    self.assertEqual(action.shape, (batch_size, 1))
    self.assertAllEqual(np_mask[action], np.ones([batch_size, 1]))
    def testRunsWithLstmStack(self, lstm_size, rnn_construction_fn):
        """Smoke-tests an RNN actor net built through rnn_construction_fn."""
        obs_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
        step_spec = ts.time_step_spec(obs_spec)
        time_step = tensor_spec.sample_spec_nest(step_spec, outer_dims=(1, 5))

        action_spec = [
            tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
            tensor_spec.BoundedTensorSpec((3, ), tf.int32, 0, 3),
        ]

        net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            obs_spec,
            action_spec,
            conv_layer_params=[(4, 2, 2)],
            input_fc_layer_params=(5, ),
            output_fc_layer_params=(5, ),
            lstm_size=lstm_size,
            rnn_construction_fn=rnn_construction_fn,
            rnn_construction_kwargs={'lstm_size': 3})

        state = actor_policy.ActorPolicy(step_spec, action_spec,
                                         net).get_initial_state(1)
        distributions, _ = net(time_step.observation, time_step.step_type,
                               state)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        # Sampling from each output distribution verifies the net runs end to end.
        self.evaluate(
            tf.nest.map_structure(lambda d: d.sample(), distributions))
  def testHandlePreprocessingLayers(self):
    """RNN actor net should accept per-component preprocessing layers."""
    observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                        tensor_spec.TensorSpec([], tf.float32))
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(3, 4))

    action_spec = [
        tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
        tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
    ]

    # One preprocessing layer per observation component; their outputs are
    # merged by the Add combiner below.
    scalar_branch = sequential_layer.SequentialLayer([
        tf.keras.layers.Reshape((1,)),
        tf.keras.layers.Dense(4)
    ])
    preprocessing_layers = (tf.keras.layers.Dense(4), scalar_branch)

    net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        observation_spec,
        action_spec,
        preprocessing_layers=preprocessing_layers,
        preprocessing_combiner=tf.keras.layers.Add())

    state = actor_policy.ActorPolicy(time_step_spec, action_spec,
                                     net).get_initial_state(3)
    dists, _ = net(time_step.observation, time_step.step_type, state)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    # Modes keep the (batch=3, time=4, action-dim) layout.
    self.assertEqual([3, 4, 2], dists[0].mode().shape.as_list())
    self.assertEqual([3, 4, 3], dists[1].mode().shape.as_list())
    self.assertGreater(len(net.trainable_variables), 4)
Пример #7
0
  def testBuild(self, network_ctor):
    """A freshly built policy exposes the specs it was constructed with."""
    net = network_ctor(self._obs_spec, self._action_spec)
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=net)

    self.assertEqual(self._time_step_spec, policy.time_step_spec())
    self.assertEqual(self._action_spec, policy.action_spec())
    self.assertEqual([], policy.variables())
Пример #8
0
  def testGaussianDistribution(self):
    """distribution() over a distribution-producing net yields a Normal."""
    net = DummyActionDistributionNet(self._obs_spec, self._action_spec)
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=net)

    step = policy.distribution(self._time_step_batch)
    self.assertIsInstance(step.action, tfp.distributions.Normal)
Пример #9
0
  def testBuild(self):
    """Policy exposes time_step_spec/action_spec as properties."""
    net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=(2, 1))
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=net)

    self.assertEqual(self._time_step_spec, policy.time_step_spec)
    self.assertEqual(self._action_spec, policy.action_spec)
Пример #10
0
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 optimizer,
                 normalize_returns=True,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 entropy_regularization=None,
                 train_step_counter=None,
                 name=None):
        """Creates a REINFORCE Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: Optimizer for the actor network.
      normalize_returns: Whether to normalize returns across episodes when
        computing the loss.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      entropy_regularization: Coefficient for entropy regularization loss term.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        # Register this agent as a tf.Module so its variables are tracked
        # under `name`.
        tf.Module.__init__(self, name=name)

        self._actor_network = actor_network

        # Collect with the stochastic (clipped) actor policy; evaluation uses
        # its greedy wrapper below.
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=True)

        policy = greedy_policy.GreedyPolicy(collect_policy)

        self._optimizer = optimizer
        self._normalize_returns = normalize_returns
        self._gradient_clipping = gradient_clipping
        self._entropy_regularization = entropy_regularization

        # train_sequence_length=None: training consumes whole episodes of
        # variable length rather than fixed-size sequences.
        super(ReinforceAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
Пример #11
0
def get_d4rl_policy(env, weights, is_dapg=False):
    """Creates a TF Agents policy from D4RL saved weights.

    Args:
      env: Environment providing observation/action/time_step specs.
      weights: Dict of numpy arrays keyed 'fc%d/weight' / 'fc%d/bias' for each
        hidden layer, plus 'last_fc/weight', 'last_fc/bias',
        'last_fc_log_std/weight', 'last_fc_log_std/bias'.
      is_dapg: If True, build a DAPG-style head (tanh activations,
        state-dependent-std normal projection with separate mean/log-std
        layers); otherwise a SAC-style head (relu activations, tanh-normal
        projection with clipped log-std).

    Returns:
      An `ActorPolicy` (training=False) with the given weights loaded.
    """
    hidden_dims = []
    fc_idx = 0
    while 'fc%d/weight' % fc_idx in weights:
        # Bug fix: read each layer's own weight matrix. The original code
        # hard-coded 'fc0/weight', which yields wrong layer sizes whenever
        # the hidden layers are not all the same width.
        hidden_dims.append(np.shape(weights['fc%d/weight' % fc_idx])[0])
        fc_idx += 1

    if is_dapg:
        activation_fn = tf.keras.activations.tanh
        continuous_projection_net = functools.partial(
            normal_projection_network.NormalProjectionNetwork,
            mean_transform=None,
            std_transform=tf.exp,
            state_dependent_std=True)
    else:
        activation_fn = tf.keras.activations.relu
        continuous_projection_net = functools.partial(
            tanh_normal_projection_network.TanhNormalProjectionNetwork,
            std_transform=lambda x: tf.exp(tf.clip_by_value(x, -5., 2.)))

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=hidden_dims,
        continuous_projection_net=continuous_projection_net,
        activation_fn=activation_fn)
    policy = actor_policy.ActorPolicy(time_step_spec=env.time_step_spec(),
                                      action_spec=env.action_spec(),
                                      actor_network=actor_net,
                                      training=False)

    # Copy the saved weights into the network (transposed: D4RL stores
    # weight matrices as (out, in), Keras expects (in, out)).
    # pylint: disable=protected-access
    for fc_idx in range(len(hidden_dims)):
        actor_net._encoder.layers[fc_idx + 1].set_weights(
            [weights['fc%d/weight' % fc_idx].T, weights['fc%d/bias' % fc_idx]])

    if is_dapg:
        # DAPG head: separate mean and log-std projection layers.
        actor_net._projection_networks.layers[0].set_weights(
            [weights['last_fc/weight'].T, weights['last_fc/bias']])
        actor_net._projection_networks.layers[1].set_weights([
            weights['last_fc_log_std/weight'].T,
            weights['last_fc_log_std/bias']
        ])
    else:
        # SAC head: one projection layer emitting concatenated (mean, log-std).
        actor_net._projection_networks.layers[0].set_weights([
            np.concatenate(
                (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
                axis=0).T,
            np.concatenate(
                (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
                axis=0)
        ])
    # pylint: enable=protected-access
    return policy
Пример #12
0
 def get_option_policies(self):
     """Returns one greedy policy per option sub-network of self.actor_net."""
     policies = []
     for option_net in self.actor_net.get_options():
         base = actor_policy.ActorPolicy(
             time_step_spec=self.time_step_spec,
             action_spec=self.action_spec,
             actor_network=option_net)
         policies.append(greedy_policy.GreedyPolicy(base))
     return policies
Пример #13
0
def get_env_and_policy_from_weights(env_name: str,
                                    weights: Mapping[str, np.ndarray],
                                    n_hidden: int = 300,
                                    min_log_std: float = -5,
                                    max_log_std: float = 2):
  """Return tf_env and policy from dictionary of weights.

  Assumes that the policy has 2 hidden layers with `n_hidden` units, ReLU
  activations, and outputs a normal distribution squashed by a Tanh.

  Args:
    env_name: Name of the environment.
    weights: Dictionary of weights containing keys: fc0/weight, fc0/bias,
      fc1/weight, fc1/bias, last_fc/weight, last_fc_log_std/weight,
      last_fc/bias, last_fc_log_std/bias
    n_hidden: Number of units in each of the two hidden layers.
    min_log_std: Lower clip bound for the policy's log standard deviation.
    max_log_std: Upper clip bound for the policy's log standard deviation.

  Returns:
    tf_env: TF wrapped env.
    policy: TF Agents policy.
  """
  env = suites.load_mujoco(env_name)
  tf_env = tf_py_environment.TFPyEnvironment(env)
  std_transform = (
      lambda x: tf.exp(tf.clip_by_value(x, min_log_std, max_log_std)))
  actor_net = actor_distribution_network.ActorDistributionNetwork(
      tf_env.observation_spec(),
      tf_env.action_spec(),
      fc_layer_params=(n_hidden, n_hidden),
      continuous_projection_net=functools.partial(
          tanh_normal_projection_network.TanhNormalProjectionNetwork,
          std_transform=std_transform),
      activation_fn=tf.keras.activations.relu,
  )
  policy = actor_policy.ActorPolicy(
      time_step_spec=tf_env.time_step_spec(),
      action_spec=tf_env.action_spec(),
      actor_network=actor_net,
      training=False)

  # Set weights (transposed: saved matrices are (out, in), Keras wants
  # (in, out)).
  actor_net._encoder.layers[1].set_weights(  # pylint: disable=protected-access
      [weights['fc0/weight'].T, weights['fc0/bias']])
  actor_net._encoder.layers[2].set_weights(  # pylint: disable=protected-access
      [weights['fc1/weight'].T, weights['fc1/bias']])
  # Projection layer emits concatenated (mean, log-std) rows.
  actor_net._projection_networks.layers[0].set_weights(  # pylint: disable=protected-access
      [
          np.concatenate(
              (weights['last_fc/weight'], weights['last_fc_log_std/weight']),
              axis=0).T,
          np.concatenate(
              (weights['last_fc/bias'], weights['last_fc_log_std/bias']),
              axis=0)
      ])
  return tf_env, policy
Пример #14
0
 def setUp(self):
   """Builds specs and the unclipped ActorPolicy that OU-noise tests wrap."""
   super(OuNoisePolicyTest, self).setUp()
   self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3)
   actor_network = DummyActionNet(self._obs_spec, self._action_spec)
   # clip=False: the OU-noise wrapper handles bounding, so the wrapped
   # policy must emit raw (unclipped) actions.
   self._wrapped_policy = actor_policy.ActorPolicy(
       time_step_spec=self._time_step_spec,
       action_spec=self._action_spec,
       actor_network=actor_network,
       clip=False)
Пример #15
0
def get_sac_policy(tf_env):
    """Builds an untrained SAC-style actor policy for tf_env."""
    network = actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=(256, 256),
        continuous_projection_net=(
            tanh_normal_projection_network.TanhNormalProjectionNetwork))
    return actor_policy.ActorPolicy(
        time_step_spec=tf_env.time_step_spec(),
        action_spec=tf_env.action_spec(),
        actor_network=network,
        training=False)
Пример #16
0
  def testActionBatch(self, network_ctor):
    """Batched action() has the right shape/dtype and respects the bounds."""
    actor_network = network_ctor(self._obs_spec, self._action_spec)
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=actor_network)

    action_step = policy.action(self._time_step_batch)
    self.assertEqual(action_step.action.shape.as_list(), [2, 1])
    self.assertEqual(action_step.action.dtype, tf.float32)
    # Fix: bare tf.global_variables_initializer is TF1-only; use the
    # tf.compat.v1 alias as the other tests in this file do.
    self.evaluate(tf.compat.v1.global_variables_initializer())
    actions_ = self.evaluate(action_step.action)
    self.assertTrue(np.all(actions_ >= self._action_spec.minimum))
    self.assertTrue(np.all(actions_ <= self._action_spec.maximum))
Пример #17
0
  def testActionBatch(self):
    """Batched actions have the expected shape/dtype and stay within bounds."""
    net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=(2, 1))
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=net)

    action_step = policy.action(self._time_step_batch)
    self.assertEqual([2, 1], action_step.action.shape.as_list())
    self.assertEqual(self._action_spec.dtype, action_step.action.dtype)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    sampled = self.evaluate(action_step.action)
    self.assertTrue(np.all(sampled >= self._action_spec.minimum))
    self.assertTrue(np.all(sampled <= self._action_spec.maximum))
Пример #18
0
  def testUpdate(self):
    """new_policy.update(policy) copies variables so both act identically."""
    # Fix: bare tf.set_random_seed / tf.global_variables_initializer are
    # TF1-only; use the tf.compat.v1 aliases as the rest of the file does.
    tf.compat.v1.set_random_seed(1)
    actor_network = DummyActionNet(self._obs_spec, self._action_spec)
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=actor_network)
    self.assertEqual(policy.variables(), [])
    new_policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=actor_network)

    # Calling action() creates the underlying network variables.
    action_step = policy.action(self._time_step_batch)
    self.assertLen(policy.variables(), 2)
    new_action_step = new_policy.action(self._time_step_batch)
    self.assertLen(new_policy.variables(), 2)

    self.assertEqual(action_step.action.shape, new_action_step.action.shape)
    self.assertEqual(action_step.action.dtype, new_action_step.action.dtype)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(new_policy.update(policy))
    actions_, new_actions_ = self.evaluate(
        [action_step.action, new_action_step.action])
    self.assertAllEqual(actions_, new_actions_)
Пример #19
0
  def testActionDistribution(self):
    """Samples from distribution() respect the action-spec bounds."""
    net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=(2, 1))
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=net)

    # Force creation of variables before global_variables_initializer.
    policy.variables()
    self.evaluate(tf.compat.v1.global_variables_initializer())

    dist_step = policy.distribution(self._time_step_batch)
    sampled = self.evaluate(dist_step.action.sample())
    self.assertTrue(np.all(sampled >= self._action_spec.minimum))
    self.assertTrue(np.all(sampled <= self._action_spec.maximum))
Пример #20
0
  def testDeterministicDistribution(self):
    """A plain action net yields a Deterministic distribution matching action()."""
    actor_network = DummyActionNet(self._obs_spec, self._action_spec)
    policy = actor_policy.ActorPolicy(
        self._time_step_spec, self._action_spec, actor_network=actor_network)

    action_step = policy.action(self._time_step_batch)
    distribution_step = policy.distribution(self._time_step_batch)
    self.assertIsInstance(distribution_step.action,
                          tfp.distributions.Deterministic)
    distribution_mean = distribution_step.action.mean()
    # Fix: bare tf.global_variables_initializer is TF1-only; use the
    # tf.compat.v1 alias as the other tests in this file do.
    self.evaluate(tf.compat.v1.global_variables_initializer())
    actions_ = self.evaluate(action_step.action)
    distribution_mean_ = self.evaluate(distribution_mean)
    # The deterministic distribution's mean must equal the sampled action.
    self.assertNear(actions_[0], distribution_mean_[0], 1e-6)
Пример #21
0
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 optimizer,
                 normalize_returns=True,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False):
        """Creates a REINFORCE Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: Optimizer for the actor network.
      normalize_returns: Whether to normalize returns across episodes when
        computing the loss.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
    """

        self._actor_network = actor_network
        # Collect with the stochastic (clipped) actor policy; evaluation uses
        # the greedy wrapper below.
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=True)
        policy = greedy_policy.GreedyPolicy(collect_policy)

        self._optimizer = optimizer
        self._normalize_returns = normalize_returns
        self._gradient_clipping = gradient_clipping

        # train_sequence_length=None: training consumes whole variable-length
        # episodes rather than fixed-size sequences.
        super(ReinforceAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
Пример #22
0
    def testSavedModel(self):
        """Round-trips a policy through PolicySaver and compares actions."""
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in eager.')

        observation_spec = array_spec.ArraySpec([2], np.float32)
        action_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)
        time_step_spec = ts.time_step_spec(observation_spec)

        observation_tensor_spec = tensor_spec.from_spec(observation_spec)
        action_tensor_spec = tensor_spec.from_spec(action_spec)
        time_step_tensor_spec = tensor_spec.from_spec(time_step_spec)

        net = actor_network.ActorNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(10, ),
        )
        tf_policy = actor_policy.ActorPolicy(time_step_tensor_spec,
                                             action_tensor_spec,
                                             actor_network=net)

        # Save the policy, then reload it through the eager py wrapper.
        path = os.path.join(self.get_temp_dir(), 'saved_policy')
        policy_saver.PolicySaver(tf_policy).save(path)
        eager_py_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
            path, time_step_spec, action_spec)

        # Evaluate the original and the reloaded policy on the same sample.
        rng = np.random.RandomState()
        sample_time_step = array_spec.sample_spec_nest(time_step_spec, rng)
        batched_time_step = nest_utils.batch_nested_array(sample_time_step)

        original_action = nest_utils.unbatch_nested_tensors(
            tf_policy.action(batched_time_step))
        original_action_np = tf.nest.map_structure(lambda t: t.numpy(),
                                                   original_action)
        saved_policy_action = eager_py_policy.action(sample_time_step)

        tf.nest.assert_same_structure(saved_policy_action.action, action_spec)
        np.testing.assert_array_almost_equal(original_action_np.action,
                                             saved_policy_action.action)
Пример #23
0
  def testActionList(self):
    """OU-noise over a list action_spec yields a list of bounded actions."""
    action_spec = [self._action_spec]
    actor_network = DummyActionNet(self._obs_spec, action_spec)
    # clip=False: the OU-noise wrapper is responsible for the raw actions.
    self._wrapped_policy = actor_policy.ActorPolicy(
        time_step_spec=self._time_step_spec,
        action_spec=action_spec,
        actor_network=actor_network,
        clip=False)

    policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)
    action_step = policy.action(self._time_step_batch)
    self.assertEqual(action_step.action[0].shape.as_list(), [2, 1])
    self.assertEqual(action_step.action[0].dtype, tf.float32)
    # Fix: bare tf.global/local_variables_initializer are TF1-only; use the
    # tf.compat.v1 aliases as the rest of the file does.
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.local_variables_initializer())
    actions_ = self.evaluate(action_step.action)
    self.assertTrue(np.all(actions_[0] >= self._action_spec.minimum))
    self.assertTrue(np.all(actions_[0] <= self._action_spec.maximum))
Пример #24
0
  def testPolicySaverCompatibility(self):
    """An ActorPolicy over this network can be saved with PolicySaver."""
    obs_spec = tensor_spec.TensorSpec(shape=(100,), dtype=tf.float32)
    act_spec = tensor_spec.TensorSpec(shape=(5,), dtype=tf.float32)
    time_step_tensor_spec = ts.time_step_spec(obs_spec)
    net = ActorNetwork(obs_spec, act_spec)
    net.create_variables()
    policy = actor_policy.ActorPolicy(time_step_tensor_spec, act_spec, net)

    # Trace the policy once so its functions/variables exist before saving.
    batch = tensor_spec.sample_spec_nest(
        time_step_tensor_spec, outer_dims=(5,))
    policy.action(batch)

    train_step = common.create_variable('train_step')
    saver = policy_saver.PolicySaver(policy, train_step=train_step)
    self.initialize_v1_variables()

    with self.cached_session():
      saver.save(os.path.join(FLAGS.test_tmpdir, 'sequential_layer_model'))
Пример #25
0
  def _setup_as_discrete(self, time_step_spec, action_spec, loss_fn,
                         epsilon_greedy):
    """Configures discrete BC: loss fn plus (greedy, epsilon-greedy) policies."""
    self._bc_loss_fn = loss_fn or self._discrete_loss

    flat_specs = tf.nest.flatten([self._network_output_spec])
    emits_distribution = any(
        isinstance(s, distribution_utils.DistributionSpecV2)
        for s in flat_specs)
    if emits_distribution:
      # Cloning network emits a distribution: wrap it in an actor policy.
      base_policy = actor_policy.ActorPolicy(time_step_spec, action_spec,
                                             self._cloning_network)
    else:
      # Cloning network emits logits: treat them as Q-values.
      base_policy = q_policy.QPolicy(
          time_step_spec,
          action_spec,
          q_network=self._cloning_network,
          validate_action_spec_and_network=False)
    eval_policy = greedy_policy.GreedyPolicy(base_policy)
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        base_policy, epsilon=epsilon_greedy)
    return eval_policy, collect_policy
    def testPyEnvCompatible(self):
        """PyTFEagerPolicy can drive the fixture's python environment."""
        if not common.has_eager_been_enabled():
            self.skipTest('Only supported in eager.')

        net = actor_network.ActorNetwork(
            self._observation_tensor_spec,
            self._action_tensor_spec,
            fc_layer_params=(10, ),
        )
        policy = actor_policy.ActorPolicy(self._time_step_tensor_spec,
                                          self._action_tensor_spec,
                                          actor_network=net)
        eager_policy = py_tf_eager_policy.PyTFEagerPolicy(policy)

        step = self._env.reset()
        for _ in range(100):
            step = self._env.step(eager_policy.action(step).action)
Пример #27
0
  def setUp(self):
    """Builds specs and the TF ActorPolicy the saved-model tests export."""
    super(SavedModelPYTFEagerPolicyTest, self).setUp()
    if not common.has_eager_been_enabled():
      self.skipTest('Only supported in eager.')

    observation_spec = array_spec.ArraySpec([2], np.float32)
    self.action_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)
    self.time_step_spec = ts.time_step_spec(observation_spec)

    # Tensor-spec counterparts for building the TF-side network/policy.
    observation_tensor_spec = tensor_spec.from_spec(observation_spec)
    action_tensor_spec = tensor_spec.from_spec(self.action_spec)
    time_step_tensor_spec = tensor_spec.from_spec(self.time_step_spec)

    actor_net = actor_network.ActorNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        fc_layer_params=(10,),
    )

    self.tf_policy = actor_policy.ActorPolicy(
        time_step_tensor_spec, action_tensor_spec, actor_network=actor_net)
Пример #28
0
def get_target_policy(load_dir, env_name):
    """Loads the target policy restored from a training checkpoint.

    Args:
      load_dir: Base directory containing per-environment checkpoints.
      env_name: Name of the MuJoCo environment to load.

    Returns:
      A tuple of (restored actor policy, TF-wrapped environment).
    """
    env = tf_py_environment.TFPyEnvironment(suites.load_mujoco(env_name))

    net = actor_distribution_network.ActorDistributionNetwork(
        env.observation_spec(),
        env.action_spec(),
        fc_layer_params=(256, 256),
        continuous_projection_net=(
            tanh_normal_projection_network.TanhNormalProjectionNetwork))
    # Wrap in GreedyPolicy so the checkpoint object graph matches training.
    policy = greedy_policy.GreedyPolicy(
        actor_policy.ActorPolicy(
            time_step_spec=env.time_step_spec(),
            action_spec=env.action_spec(),
            actor_network=net,
            training=False))

    # Restore weights from the latest checkpoint under train/policy.
    directory = os.path.join(load_dir, env_name, 'train/policy')
    checkpoint_filename = tf.train.latest_checkpoint(directory)
    print('Loading policy from %s' % checkpoint_filename)
    checkpoint = tf.train.Checkpoint(policy=policy)
    checkpoint.restore(checkpoint_filename).assert_existing_objects_matched()

    # Return the underlying (stochastic) policy, not the greedy wrapper.
    return policy.wrapped_policy, env
Пример #29
0
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 critic_network,
                 actor_optimizer,
                 critic_optimizer,
                 ou_stddev=1.0,
                 ou_damping=1.0,
                 target_update_tau=1.0,
                 target_update_period=1,
                 dqda_clipping=None,
                 td_errors_loss_fn=None,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 target_policy_noise=0.2,
                 target_policy_noise_clip=0.5,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False):
        """Creates a Td3Agent Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The default optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
        in the default collect policy.
      ou_damping: Damping factor for the OU noise added in the default collect
        policy.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      dqda_clipping: A scalar or float clips the gradient dqda element-wise
        between [-dqda_clipping, dqda_clipping]. Default is None representing no
        clipping.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      target_policy_noise: Scale factor on target action noise
      target_policy_noise_clip: Value to clip noise.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
    """
        # Target networks start as copies of the online networks.
        self._actor_network = actor_network
        self._target_actor_network = actor_network.copy(
            name='TargetActorNetwork')

        self._critic_network_1 = critic_network
        self._target_critic_network_1 = critic_network.copy(
            name='TargetCriticNetwork1')

        # TD3 uses twin critics; the second is a fresh copy of the first.
        self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
        self._target_critic_network_2 = critic_network.copy(
            name='TargetCriticNetwork2')

        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        # TODO(kewa): better variable names.
        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common_utils.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_policy_noise = target_policy_noise
        self._target_policy_noise_clip = target_policy_noise_clip
        self._gradient_clipping = gradient_clipping

        # Deterministic (clipped) policy for evaluation; OU-noise-wrapped
        # policy for data collection.
        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(Td3Agent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
Пример #30
0
  def __init__(self,
               time_step_spec,
               action_spec,
               actor_network,
               critic_network,
               actor_optimizer,
               critic_optimizer,
               exploration_noise_std=0.1,
               critic_network_2=None,
               target_actor_network=None,
               target_critic_network=None,
               target_critic_network_2=None,
               target_update_tau=1.0,
               target_update_period=1,
               actor_update_period=1,
               dqda_clipping=None,
               td_errors_loss_fn=None,
               gamma=1.0,
               reward_scale_factor=1.0,
               target_policy_noise=0.2,
               target_policy_noise_clip=0.5,
               gradient_clipping=None,
               debug_summaries=False,
               summarize_grads_and_vars=False,
               train_step_counter=None,
               name=None):
    """Creates a Td3Agent Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The default optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      exploration_noise_std: Scale factor on exploration policy noise.
      critic_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `critic_network` are copied if this is not provided.
      target_actor_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target actor network during Q learning. Every
        `target_update_period` train steps, the weights from `actor_network` are
        copied (possibly with smoothing via `target_update_tau`) to `
        target_actor_network`.  If `target_actor_network` is not provided, it is
        created by making a copy of `actor_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `actor_network`, so that this can be used as a target network.
        If you provide a `target_actor_network` that shares any weights with
        `actor_network`, a warning will be logged but no exception is thrown.
      target_critic_network: (Optional.) Similar network as target_actor_network
        but for the critic_network. See documentation for target_actor_network.
      target_critic_network_2: (Optional.) Similar network as
        target_actor_network but for the critic_network_2. See documentation for
        target_actor_network. Will only be used if 'critic_network_2' is also
        specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      actor_update_period: Period for the optimization step on actor network.
      dqda_clipping: A scalar or float clips the gradient dqda element-wise
        between [-dqda_clipping, dqda_clipping]. Default is None representing no
        clipping.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      target_policy_noise: Scale factor on target action noise
      target_policy_noise_clip: Value to clip noise.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
    tf.Module.__init__(self, name=name)
    # Build variables eagerly so target-network copies can validate sharing.
    self._actor_network = actor_network
    actor_network.create_variables()
    if target_actor_network:
      target_actor_network.create_variables()
    self._target_actor_network = common.maybe_copy_target_network_with_checks(
        self._actor_network, target_actor_network, 'TargetActorNetwork')

    self._critic_network_1 = critic_network
    critic_network.create_variables()
    if target_critic_network:
      target_critic_network.create_variables()
    self._target_critic_network_1 = (
        common.maybe_copy_target_network_with_checks(self._critic_network_1,
                                                     target_critic_network,
                                                     'TargetCriticNetwork1'))

    # TD3 uses twin critics; fall back to a copy of the first critic when a
    # second one was not supplied by the caller.
    if critic_network_2 is not None:
      self._critic_network_2 = critic_network_2
    else:
      self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
      # Do not use target_critic_network_2 if critic_network_2 is None.
      target_critic_network_2 = None
    self._critic_network_2.create_variables()
    if target_critic_network_2:
      target_critic_network_2.create_variables()
    self._target_critic_network_2 = (
        common.maybe_copy_target_network_with_checks(self._critic_network_2,
                                                     target_critic_network_2,
                                                     'TargetCriticNetwork2'))

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer

    self._exploration_noise_std = exploration_noise_std
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._actor_update_period = actor_update_period
    self._dqda_clipping = dqda_clipping
    self._td_errors_loss_fn = (
        td_errors_loss_fn or common.element_wise_huber_loss)
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_policy_noise = target_policy_noise
    self._target_policy_noise_clip = target_policy_noise_clip
    self._gradient_clipping = gradient_clipping

    self._update_target = self._get_target_updater(target_update_tau,
                                                   target_update_period)

    # Deterministic (clipped) policy for evaluation; Gaussian-noise-wrapped
    # policy for data collection.
    policy = actor_policy.ActorPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        clip=True)
    collect_policy = actor_policy.ActorPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        clip=False)
    collect_policy = gaussian_policy.GaussianPolicy(
        collect_policy, scale=self._exploration_noise_std, clip=True)

    super(Td3Agent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=2 if not self._actor_network.state_spec else None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)