def testSaveWrappedPolicyRestoreOuterCheckAssertConsumed(self, batch_size=5):

    actor_policy_save_path = os.path.join(self.get_temp_dir(),
                                          'actor_policy', str(batch_size))
    noise_policy_save_path = os.path.join(self.get_temp_dir(),
                                          'noise_policy', str(batch_size))

    # Construct a policy to be saved under a tf.Graph instance.
    policy_saved_graph = tf.Graph()
    with policy_saved_graph.as_default():
      actor_network = DummyActionNet(self._obs_spec, self._float_action_spec)
      wrapped_policy = actor_policy.ActorPolicy(
          time_step_spec=self._time_step_spec,
          action_spec=self._float_action_spec,
          actor_network=actor_network,
          clip=False)
      tf_policy = ou_noise_policy.OUNoisePolicy(wrapped_policy)

      # Save the exploration policy and the wrapped actor policy.
      actor_policy_saved = py_tf_policy.PyTFPolicy(wrapped_policy)
      noise_policy_saved = py_tf_policy.PyTFPolicy(tf_policy)
      for policy_saved, policy_save_path in zip(
          [actor_policy_saved, noise_policy_saved],
          [actor_policy_save_path, noise_policy_save_path]):
        policy_saved.session = tf.compat.v1.Session(graph=policy_saved_graph)
        policy_saved.initialize(batch_size)
        policy_saved.save(policy_dir=policy_save_path, graph=policy_saved_graph)

    # Construct a policy to be restored under another tf.Graph instance.
    policy_restore_graph = tf.Graph()
    with policy_restore_graph.as_default():
      actor_network = DummyActionNet(self._obs_spec, self._float_action_spec)
      wrapped_policy = actor_policy.ActorPolicy(
          time_step_spec=self._time_step_spec,
          action_spec=self._float_action_spec,
          actor_network=actor_network,
          clip=False)
      tf_policy = ou_noise_policy.OUNoisePolicy(wrapped_policy)

      policy_restored = py_tf_policy.PyTFPolicy(tf_policy)
      policy_restored.session = tf.compat.v1.Session(graph=policy_restore_graph)
      policy_restored.initialize(batch_size)
      # 1). Restoring the same noise policy as was saved.
      policy_restored.restore(
          policy_dir=noise_policy_save_path, graph=policy_restore_graph)
      # 2). Restoring only the actor policy wrapped inside the noise policy.
      # Although the restore graph contains an additional local variable for
      # the OU noise, this also works as long as we do not check that the
      # checkpoint was fully consumed.
      policy_restored.restore(
          policy_dir=actor_policy_save_path, graph=policy_restore_graph,
          assert_consumed=False)
      # 3). Restoring the actor policy while checking that all variables in
      # the checkpoint were found in the graph should fail.
      with self.assertRaisesRegex(
          AssertionError,
          'Some Python objects were not bound to checkpointed values*'):
        policy_restored.restore(
            policy_dir=actor_policy_save_path,
            graph=policy_restore_graph)
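
Step 3) fails because of object-based checkpointing: the restore graph contains an OU noise variable with no counterpart in the actor-only checkpoint, so asserting that the checkpoint was fully consumed raises. A minimal standalone sketch of that mechanism, using tf.train.Checkpoint directly with illustrative variable names (not taken from the test above):

import os
import tempfile

import tensorflow as tf

# Save a checkpoint that covers only an "actor" variable.
actor = tf.Variable(1.0)
save_path = tf.train.Checkpoint(actor=actor).save(
    os.path.join(tempfile.mkdtemp(), 'actor_only'))

# Restore into an object graph that has an extra variable (akin to the OU state).
restored_actor = tf.Variable(0.0)
ou_state = tf.Variable(0.0)
status = tf.train.Checkpoint(
    actor=restored_actor, ou=ou_state).restore(save_path)
status.assert_consumed()  # Raises: `ou` was never bound to a checkpointed value.
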
Example #2
 def testActionIsInRange(self):
   policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)
   action_step = policy.action(self._time_step_batch)
   self.assertEqual(action_step.action.shape.as_list(), [2, 1])
   self.assertEqual(action_step.action.dtype, tf.float32)
   self.evaluate(tf.compat.v1.global_variables_initializer())
   self.evaluate(tf.compat.v1.local_variables_initializer())
   actions_ = self.evaluate(action_step.action)
   self.assertTrue(np.all(actions_ >= self._action_spec.minimum))
   self.assertTrue(np.all(actions_ <= self._action_spec.maximum))
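
These tests assume fixtures (self._wrapped_policy, self._time_step_batch and the specs) built in setUp. A minimal sketch of an equivalent setup, using a stock ActorNetwork in place of the test's DummyActionNet and made-up spec shapes:

import tensorflow as tf
from tf_agents.agents.ddpg import actor_network
from tf_agents.policies import actor_policy, ou_noise_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

obs_spec = tensor_spec.TensorSpec([3], tf.float32, name='observation')
action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1.0, 1.0)
time_step_spec = ts.time_step_spec(obs_spec)

net = actor_network.ActorNetwork(obs_spec, action_spec, fc_layer_params=(16,))
wrapped_policy = actor_policy.ActorPolicy(
    time_step_spec=time_step_spec, action_spec=action_spec,
    actor_network=net, clip=False)
policy = ou_noise_policy.OUNoisePolicy(wrapped_policy, ou_stddev=0.2,
                                       ou_damping=0.15)

time_step_batch = ts.restart(tf.zeros([2, 3]), batch_size=2)
action_step = policy.action(time_step_batch)  # clip=True by default, so the
                                              # actions land in [-1, 1].
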
Example #3
  def testActionAddsOUNoise(self):
    policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy, clip=False)
    action_step = policy.action(self._time_step_batch)
    wrapped_action_step = self._wrapped_policy.action(self._time_step_batch)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.local_variables_initializer())
    actions_ = self.evaluate(action_step.action)
    wrapped_policy_actions_ = self.evaluate(wrapped_action_step.action)

    self.assertGreater(np.linalg.norm(actions_ - wrapped_policy_actions_), 0)
Example #4
  def testActionList(self):
    action_spec = [self._action_spec]
    actor_network = DummyActionNet(self._obs_spec, action_spec)
    self._wrapped_policy = actor_policy.ActorPolicy(
        time_step_spec=self._time_step_spec,
        action_spec=action_spec,
        actor_network=actor_network,
        clip=False)

    policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)
    action_step = policy.action(self._time_step_batch)
    self.assertEqual(action_step.action[0].shape.as_list(), [2, 1])
    self.assertEqual(action_step.action[0].dtype, tf.float32)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(tf.compat.v1.local_variables_initializer())
    actions_ = self.evaluate(action_step.action)
    self.assertTrue(np.all(actions_[0] >= self._action_spec.minimum))
    self.assertTrue(np.all(actions_[0] <= self._action_spec.maximum))
Example #5
 def testBuild(self):
   policy = ou_noise_policy.OUNoisePolicy(self._wrapped_policy)
   self.assertEqual(policy.time_step_spec(), self._time_step_spec)
   self.assertEqual(policy.action_spec(), self._action_spec)
   self.assertEqual(len(policy.variables()), 2)
Example #6
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 critic_network,
                 actor_optimizer,
                 critic_optimizer,
                 ou_stddev=1.0,
                 ou_damping=1.0,
                 target_update_tau=1.0,
                 target_update_period=1,
                 dqda_clipping=None,
                 td_errors_loss_fn=None,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 target_policy_noise=0.2,
                 target_policy_noise_clip=0.5,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False):
        """Creates a Td3Agent Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The default optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
        in the default collect policy.
      ou_damping: Damping factor for the OU noise added in the default collect
        policy.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      dqda_clipping: A scalar or float; clips the gradient dqda element-wise
        to [-dqda_clipping, dqda_clipping]. Default is None, meaning no
        clipping.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      target_policy_noise: Scale factor on the target action noise.
      target_policy_noise_clip: Value at which the target action noise is
        clipped.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
    """
        self._actor_network = actor_network
        self._target_actor_network = actor_network.copy(
            name='TargetActorNetwork')

        self._critic_network_1 = critic_network
        self._target_critic_network_1 = critic_network.copy(
            name='TargetCriticNetwork1')

        self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
        self._target_critic_network_2 = critic_network.copy(
            name='TargetCriticNetwork2')

        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        # TODO(kewa): better variable names.
        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common_utils.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_policy_noise = target_policy_noise
        self._target_policy_noise_clip = target_policy_noise_clip
        self._gradient_clipping = gradient_clipping

        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(Td3Agent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
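
target_update_tau and target_update_period control the soft target-network updates mentioned in the docstring. A minimal sketch of that update rule (not the agent's actual _get_target_updater), assuming matching source/target variable lists:

import tensorflow as tf

def soft_update(source_vars, target_vars, tau=0.005):
  # Every `target_update_period` train steps the agent applies:
  #   target <- tau * source + (1 - tau) * target
  # tau=1.0 (the constructor default above) reduces to a hard copy.
  for source, target in zip(source_vars, target_vars):
    target.assign(tau * source + (1.0 - tau) * target)
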
Example #7
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 critic_network,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 ou_stddev=1.0,
                 ou_damping=1.0,
                 target_actor_network=None,
                 target_critic_network=None,
                 target_update_tau=1.0,
                 target_update_period=1,
                 dqda_clipping=None,
                 td_errors_loss_fn=None,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a DDPG Agent.
    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type[, policy_state])
        and should return (action, new_state).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call((observation, action), step_type[,
        policy_state]) and should return (q_value, new_state).
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The optimizer to use for the critic network.
      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
        in the default collect policy.
      ou_damping: Damping factor for the OU noise added in the default collect
        policy.
      target_actor_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the actor target network during Q learning.  Every
        `target_update_period` train steps, the weights from `actor_network` are
        copied (possibly with smoothing via `target_update_tau`) to
        `target_actor_network`.
        If `target_actor_network` is not provided, it is created by making a
        copy of `actor_network`, which initializes a new network with the same
        structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or
        when the network is sharing layers with another).  In these cases, it is
        up to you to build a copy having weights that are not
        shared with the original `actor_network`, so that this can be used as a
        target network.  If you provide a `target_actor_network` that shares any
        weights with `actor_network`, a warning will be logged but no exception
        is thrown.
      target_critic_network: (Optional.) Similar to target_actor_network, but
        for the critic_network. See the documentation for target_actor_network.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      dqda_clipping: when computing the actor loss, clips the gradient dqda
        element-wise between [-dqda_clipping, dqda_clipping]. Does not perform
        clipping if dqda_clipping == 0.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._actor_network = actor_network
        actor_network.create_variables()
        if target_actor_network:
            target_actor_network.create_variables()
        self._target_actor_network = common.maybe_copy_target_network_with_checks(
            self._actor_network, target_actor_network, 'TargetActorNetwork')
        self._critic_network = critic_network
        critic_network.create_variables()
        if target_critic_network:
            target_critic_network.create_variables()
        self._target_critic_network = common.maybe_copy_target_network_with_checks(
            self._critic_network, target_critic_network, 'TargetCriticNetwork')

        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)
        """Nitty: change time_step_spec to that of individual agent from total spec"""
        individual_time_step_spec = ts.get_individual_time_step_spec(
            time_step_spec)
        policy = actor_policy.ActorPolicy(
            time_step_spec=individual_time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=individual_time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)

        # policy = actor_policy.ActorPolicy(
        #     time_step_spec=time_step_spec, action_spec=action_spec,
        #     actor_network=self._actor_network, clip=True)
        # collect_policy = actor_policy.ActorPolicy(
        #     time_step_spec=time_step_spec, action_spec=action_spec,
        #     actor_network=self._actor_network, clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(DdpgAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
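
The dqda_clipping argument documented above bounds the critic's action gradient before it drives the actor update. A generic DPG-style sketch of that step (not the agent's own actor_loss; `actor` and `critic` are assumed to be plain callables):

import tensorflow as tf

def dpg_actor_loss(actor, critic, observations, dqda_clipping=None):
  """actor: obs -> actions; critic: (obs, actions) -> Q-values."""
  actions = actor(observations)
  with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(actions)
    q_values = critic(observations, actions)
  dqda = tape.gradient(q_values, actions)
  if dqda_clipping is not None:
    # Element-wise clip of dQ/da, as described in the docstring.
    dqda = tf.clip_by_value(dqda, -dqda_clipping, dqda_clipping)
  # Surrogate loss whose gradient w.r.t. the actor parameters is
  # -dqda * d(actions)/d(theta), i.e. gradient ascent on Q.
  return tf.reduce_mean(
      0.5 * tf.square(tf.stop_gradient(dqda + actions) - actions))
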
Example #8
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 load_model_path=None,
                 save_model_path=None,
                 ou_stddev=0.0,
                 ou_damping=1.0,
                 target_update_tau=0.05,
                 target_update_period=5,
                 max_episode_steps=None,
                 ensemble_size=3,
                 combine_ensemble_method='min',
                 distance_type='distributional'):
        tf.Module.__init__(self, name='UvfAgent')
        assert max_episode_steps is not None
        self._max_episode_steps = max_episode_steps
        self._ensemble_size = ensemble_size
        self._distance_type = distance_type

        self._actor_network = GoalConditionedActorNetwork(
            time_step_spec.observation, action_spec)
        self._target_actor_network = self._actor_network.copy(
            name='TargetActorNetwork')

        critic_net_input_specs = (time_step_spec.observation, action_spec)
        critic_network = GoalConditionedCriticNetwork(
            critic_net_input_specs,
            output_dim=max_episode_steps
            if distance_type == 'distributional' else None)
        self._critic_network_list = []
        self._target_critic_network_list = []
        for ensemble_index in range(self._ensemble_size):
            self._critic_network_list.append(
                critic_network.copy(name='CriticNetwork%d' % ensemble_index))
            self._target_critic_network_list.append(
                critic_network.copy(name='TargetCriticNetwork%d' %
                                    ensemble_index))

        net_list = [
            self._actor_network, self._target_actor_network
        ] + self._critic_network_list + self._target_critic_network_list
        for net in net_list:
            net.create_variables()

        self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=1e-4)
        self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=1e-4)

        self._train_iter = tf.Variable(0)
        mix_dict = self.model_variable
        self.load_model(load_model_path, save_model_path, mix_dict)

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        # OU noise update: x <- (1 - damping) * x + N(0, ou_stddev)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(UvfAgent, self).__init__(time_step_spec,
                                       action_spec,
                                       policy,
                                       collect_policy,
                                       train_sequence_length=2)
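
The comment `noise x = (1-damping)*x + N(0,std)` above summarizes the Ornstein-Uhlenbeck process that OUNoisePolicy adds for temporally correlated exploration. A minimal NumPy sketch of that update rule (step form and parameter values are illustrative):

import numpy as np

def ou_step(x, rng, stddev=0.2, damping=0.15):
  # The state decays toward zero by `damping` and receives fresh Gaussian
  # noise, so successive samples are correlated rather than independent.
  return (1.0 - damping) * x + rng.normal(scale=stddev, size=np.shape(x))

rng = np.random.default_rng(0)
x = np.zeros(1)
trace = []
for _ in range(5):
  x = ou_step(x, rng)
  trace.append(x.copy())
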
Example #9
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 critic_network,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 ou_stddev=1.0,
                 ou_damping=1.0,
                 target_update_tau=1.0,
                 target_update_period=1,
                 dqda_clipping=None,
                 td_errors_loss_fn=None,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a DDPG Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      critic_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The optimizer to use for the critic network.
      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
        in the default collect policy.
      ou_damping: Damping factor for the OU noise added in the default collect
        policy.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      dqda_clipping: when computing the actor loss, clips the gradient dqda
        element-wise between [-dqda_clipping, dqda_clipping]. Does not perform
        clipping if dqda_clipping == 0.
      td_errors_loss_fn: A function for computing the TD errors loss. If None,
        a default value of element-wise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._actor_network = actor_network
        self._target_actor_network = self._actor_network.copy(
            name='TargetActorNetwork')

        self._critic_network = critic_network
        self._target_critic_network = self._critic_network.copy(
            name='TargetCriticNetwork')

        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(DdpgAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
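
The default td_errors_loss_fn, common.element_wise_huber_loss, keeps the per-element structure of the TD errors rather than reducing them to a scalar. A minimal sketch of an element-wise Huber loss of that shape (the delta threshold here is an assumption):

import tensorflow as tf

def element_wise_huber(y_true, y_pred, delta=1.0):
  # Quadratic for errors within `delta`, linear beyond it, no reduction.
  error = y_true - y_pred
  abs_error = tf.abs(error)
  quadratic = tf.minimum(abs_error, delta)
  linear = abs_error - quadratic
  return 0.5 * tf.square(quadratic) + delta * linear
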
Example #10
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 ou_stddev=1.0,
                 ou_damping=1.0,
                 target_update_tau=0.05,
                 target_update_period=5,
                 max_episode_steps=None,
                 ensemble_size=3,
                 combine_ensemble_method='min',
                 use_distributional_rl=True):
        """Creates a Uvf Agent.

		Args:
			time_step_spec: A `TimeStep` spec of the expected time_steps.
			action_spec: A nest of BoundedTensorSpec representing the actions.
			ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
				in the default collect policy.
			ou_damping: Damping factor for the OU noise added in the default collect
				policy.
			target_update_tau: Factor for soft update of the target networks.
			target_update_period: Period for soft update of the target networks.
			max_episode_steps: Int indicating number of steps in an episode. Used for
				determining the number of bins for distributional RL.
			ensemble_size: (int) Number of models in ensemble of critics.
			combine_ensemble_method: (str) At test time, how to combine the distances
				predicted by each member of the ensemble. Options are 'mean', 'min',
				and 'td3'. The 'td3' option is pessimistic w.r.t. the pdf, and then
				takes computes the corresponding distance. The 'min' option takes the
				minimum q values, corresponding to taking the maximum predicted
				distance. Note that we never aggregate predictions during training.
			use_distributional_rl: (bool) Whether to use distributional RL.
		"""
        tf.Module.__init__(self, name='UvfAgent')

        assert max_episode_steps is not None
        self._max_episode_steps = max_episode_steps
        self._ensemble_size = ensemble_size
        self._use_distributional_rl = use_distributional_rl

        # Create the actor
        self._actor_network = GoalConditionedActorNetwork(
            time_step_spec.observation, action_spec)
        self._target_actor_network = self._actor_network.copy(
            name='TargetActorNetwork')

        # Create a prototypical critic, which we will copy to create the ensemble.
        critic_net_input_specs = (time_step_spec.observation, action_spec)
        critic_network = GoalConditionedCriticNetwork(
            critic_net_input_specs,
            output_dim=max_episode_steps if use_distributional_rl else None,
        )

        self._critic_network_list = []
        self._target_critic_network_list = []
        for ensemble_index in range(self._ensemble_size):
            self._critic_network_list.append(
                critic_network.copy(name='CriticNetwork%d' % ensemble_index))
            self._target_critic_network_list.append(
                critic_network.copy(name='TargetCriticNetwork%d' %
                                    ensemble_index))

        self._actor_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=3e-4)
        self._critic_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=3e-4)

        self._ou_stddev = ou_stddev
        self._ou_damping = ou_damping
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy = actor_policy.ActorPolicy(time_step_spec=time_step_spec,
                                          action_spec=action_spec,
                                          actor_network=self._actor_network,
                                          clip=True)
        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = ou_noise_policy.OUNoisePolicy(
            collect_policy,
            ou_stddev=self._ou_stddev,
            ou_damping=self._ou_damping,
            clip=True)

        super(UvfAgent, self).__init__(time_step_spec,
                                       action_spec,
                                       policy,
                                       collect_policy,
                                       train_sequence_length=2)
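
The docstring's combine_ensemble_method describes how the critic ensemble's distance predictions are merged at test time. A minimal sketch of the 'min' and 'mean' options only (shapes assumed; the 'td3' variant over the predicted pdf is omitted):

import tensorflow as tf

def combine_ensemble(q_values_list, method='min'):
  stacked = tf.stack(q_values_list, axis=0)  # [ensemble_size, batch, ...]
  if method == 'min':
    # Most pessimistic Q-value, i.e. the largest predicted distance.
    return tf.reduce_min(stacked, axis=0)
  if method == 'mean':
    return tf.reduce_mean(stacked, axis=0)
  raise ValueError('Unsupported combine_ensemble_method: %s' % method)
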
Example #11
def main(_):
    # setting up
    start_time = time.time()
    tf.compat.v1.enable_resource_variables()
    tf.compat.v1.disable_eager_execution()
    logging.set_verbosity(logging.INFO)
    global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer

    root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir))
    if not tf.io.gfile.exists(root_dir):
        tf.io.gfile.makedirs(root_dir)
    log_dir = os.path.join(root_dir, FLAGS.environment)

    if not tf.io.gfile.exists(log_dir):
        tf.io.gfile.makedirs(log_dir)
    save_dir = os.path.join(log_dir, "models")
    if not tf.io.gfile.exists(save_dir):
        tf.io.gfile.makedirs(save_dir)

    print("directory for recording experiment data:", log_dir)

    # In case training was paused and is being resumed, restore the counters
    # and episode buffers.
    try:
        sample_count = np.load(os.path.join(log_dir,
                                            "sample_count.npy")).tolist()
        iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist()
        episode_size_buffer = np.load(
            os.path.join(log_dir, "episode_size_buffer.npy")).tolist()
        episode_return_buffer = np.load(
            os.path.join(log_dir, "episode_return_buffer.npy")).tolist()
    except Exception:  # Start fresh if no saved state exists yet.
        sample_count = 0
        iter_count = 0
        episode_size_buffer = []
        episode_return_buffer = []

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        os.path.join(log_dir, "train", "in_graph_data"),
        flush_millis=10 * 1000)
    train_summary_writer.set_as_default()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(True):
        # environment-related setup
        env = do.get_environment(env_name=FLAGS.environment)
        py_env = wrap_env(
            skill_wrapper.SkillWrapper(
                env,
                num_latent_skills=FLAGS.num_skills,
                skill_type=FLAGS.skill_type,
                preset_skill=None,
                min_steps_before_resample=FLAGS.min_steps_before_resample,
                resample_prob=FLAGS.resample_prob,
            ),
            max_episode_steps=FLAGS.max_env_steps,
        )

        # all specifications required for all networks and agents
        py_action_spec = py_env.action_spec()
        tf_action_spec = tensor_spec.from_spec(
            py_action_spec)  # policy, critic action spec
        env_obs_spec = py_env.observation_spec()
        py_env_time_step_spec = ts.time_step_spec(
            env_obs_spec)  # replay buffer time_step spec
        if observation_omit_size > 0:
            agent_obs_spec = array_spec.BoundedArraySpec(
                (env_obs_spec.shape[0] - observation_omit_size, ),
                env_obs_spec.dtype,
                minimum=env_obs_spec.minimum,
                maximum=env_obs_spec.maximum,
                name=env_obs_spec.name,
            )  # policy, critic observation spec
        else:
            agent_obs_spec = env_obs_spec
        py_agent_time_step_spec = ts.time_step_spec(
            agent_obs_spec)  # policy, critic time_step spec
        tf_agent_time_step_spec = tensor_spec.from_spec(
            py_agent_time_step_spec)

        if not FLAGS.reduced_observation:
            skill_dynamics_observation_size = (
                py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills)
        else:
            skill_dynamics_observation_size = FLAGS.reduced_observation

        # TODO(architsh): Shift coordinate hiding to actor_net and critic_net
        # (also useful for further image-based processing).
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_agent_time_step_spec.observation,
            tf_action_spec,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            continuous_projection_net=do._normal_projection_net,
        )

        critic_net = critic_network.CriticNetwork(
            (tf_agent_time_step_spec.observation, tf_action_spec),
            observation_fc_layer_params=None,
            action_fc_layer_params=None,
            joint_fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
        )

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            reweigh_batches_flag = True
        else:
            reweigh_batches_flag = False

        agent = dads_agent.DADSAgent(
            # DADS parameters
            save_dir,
            skill_dynamics_observation_size,
            observation_modify_fn=do.process_observation,
            restrict_input_size=observation_omit_size,
            latent_size=FLAGS.num_skills,
            latent_prior=FLAGS.skill_type,
            prior_samples=FLAGS.random_skills,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            normalize_observations=FLAGS.normalize_data,
            network_type=FLAGS.graph_type,
            num_mixture_components=FLAGS.num_components,
            fix_variance=FLAGS.fix_variance,
            reweigh_batches=reweigh_batches_flag,
            skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr,
            # SAC parameters
            time_step_spec=tf_agent_time_step_spec,
            action_spec=tf_action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            target_update_tau=0.005,
            target_update_period=1,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gamma=FLAGS.agent_gamma,
            reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12),
            gradient_clipping=None,
            debug_summaries=FLAGS.debug,
            train_step_counter=global_step,
        )

        # evaluation policy
        eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

        # collection policy
        if FLAGS.collect_policy == "default":
            collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)
        elif FLAGS.collect_policy == "ou_noise":
            collect_policy = py_tf_policy.PyTFPolicy(
                ou_noise_policy.OUNoisePolicy(agent.collect_policy,
                                              ou_stddev=0.2,
                                              ou_damping=0.15))

        # relabelling policy deals with batches of data, unlike collect and eval
        relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)

        # Construct the replay buffer; this requires a Python spec.
        policy_step_spec = policy_step.PolicyStep(action=py_action_spec,
                                                  state=(),
                                                  info=())

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            policy_step_spec = policy_step_spec._replace(
                info=policy_step.set_log_probability(
                    policy_step_spec.info,
                    array_spec.ArraySpec(
                        shape=(), dtype=np.float32, name="action_log_prob"),
                ))

        trajectory_spec = from_transition(py_env_time_step_spec,
                                          policy_step_spec,
                                          py_env_time_step_spec)
        capacity = FLAGS.replay_buffer_capacity
        # for all the data collected
        rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
            capacity=capacity, data_spec=trajectory_spec)

        if FLAGS.train_skill_dynamics_on_policy:
            # for on-policy data (if something special is required)
            on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
                capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps +
                10,
                data_spec=trajectory_spec,
            )

        # insert experience manually with relabelled rewards and skills
        agent.build_agent_graph()
        agent.build_skill_dynamics_graph()
        agent.create_savers()

        # Saving this way requires the checkpointer to live outside the agent
        # object.
        train_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "agent"),
            agent=agent,
            global_step=global_step,
        )
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "policy"),
            policy=agent.policy,
            global_step=global_step,
        )
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "replay_buffer"),
            max_to_keep=1,
            replay_buffer=rbuffer,
        )

        setup_time = time.time() - start_time
        print("Setup time:", setup_time)

        with tf.compat.v1.Session().as_default() as sess:
            eval_policy.session = sess
            eval_policy.initialize(None)
            eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy"))

            plotdir = os.path.join(FLAGS.logdir, "plots")
            if not os.path.exists(plotdir):
                os.mkdir(plotdir)
            do.FLAGS = FLAGS
            do.eval_loop(eval_dir=plotdir,
                         eval_policy=eval_policy,
                         plot_name="plot")