Example #1
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 actor=None,
                 critic=None,
                 actor_critic=None,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 const_std=0.3,
                 hidden_activation_actor="relu",
                 hidden_activation_critic="relu",
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete

        # TODO: clean codes
        if actor_critic is not None:
            self.actor_critic = actor_critic
            self.actor_critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.actor = None
            self.critic = None
        else:
            self.actor_critic = None
            if actor is None:
                if is_discrete:
                    self.actor = CategoricalActor(state_shape, action_dim,
                                                  actor_units)
                else:
                    self.actor = GaussianActor(
                        state_shape,
                        action_dim,
                        max_action,
                        actor_units,
                        hidden_activation=hidden_activation_actor,
                        fix_std=fix_std,
                        const_std=const_std,
                        state_independent_std=True)
            else:
                self.actor = actor
            if critic is None:
                self.critic = CriticV(
                    state_shape,
                    critic_units,
                    hidden_activation=hidden_activation_critic)
            else:
                self.critic = critic
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_critic)

        # This is used to check if input state to `get_action` is multiple (batch) or single
        self._state_ndim = np.array(state_shape).shape[0]
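
The last line stores `len(state_shape)` so that `get_action` can later tell a single observation from a batch by comparing `state.ndim` against it. A minimal, self-contained NumPy sketch of that check (the shapes below are illustrative, not taken from the library):

import numpy as np

state_shape = (8,)                                   # e.g. an 8-dimensional observation
state_ndim = np.array(state_shape).shape[0]          # == len(state_shape) == 1

single = np.zeros(state_shape, dtype=np.float32)     # one observation
batch = np.zeros((32,) + state_shape, dtype=np.float32)  # a batch of 32 observations

def is_single_input(state):
    # Mirrors the check in get_action(): a single observation has exactly
    # `state_ndim` dimensions, a batch carries one extra leading axis.
    return state.ndim == state_ndim

assert is_single_input(single)
assert not is_single_input(batch)

# A single observation is promoted to a batch of one before the forward pass.
print(np.expand_dims(single, axis=0).shape)          # (1, 8)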
Example #2
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 tanh_std=False,
                 const_std=0.3,
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete
        if is_discrete:
            self.actor = CategoricalActor(state_shape, action_dim, actor_units)
        else:
            self.actor = GaussianActor(state_shape,
                                       action_dim,
                                       max_action,
                                       actor_units,
                                       fix_std=fix_std,
                                       tanh_std=tanh_std,
                                       const_std=const_std)
        self.critic = CriticV(state_shape, critic_units)
        self._action_dim = action_dim
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_critic)
Example #3
    def setUpClass(cls):
        super().setUpClass()
        cls.policy = GaussianActor(
            state_shape=cls.continuous_env.observation_space.shape,
            action_dim=cls.continuous_env.action_space.low.size,
            max_action=1.,
            units=[4, 4])
        cls.const_std = 0.1
        cls.policy_fixed_sigma = GaussianActor(
            state_shape=cls.continuous_env.observation_space.shape,
            action_dim=cls.continuous_env.action_space.low.size,
            max_action=1.,
            units=[4, 4],
            fix_std=True,
            const_std=cls.const_std)
Example #4
    def setUpClass(cls):
        super().setUpClass()
        cls.policy = GaussianActor(
            state_shape=cls.continuous_env.observation_space.shape,
            action_dim=cls.continuous_env.action_space.low.size,
            max_action=1.,
            units=[4, 4])
Example #5
    def _setup_actor(self,
                     state_shape,
                     action_dim,
                     actor_units,
                     lr,
                     max_action=1.):
        self.actor = GaussianActor(state_shape,
                                   action_dim,
                                   max_action,
                                   squash=True,
                                   units=actor_units)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
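
Here `squash=True` requests a tanh-squashed Gaussian policy so that sampled actions stay bounded before being scaled by `max_action`. The library's internals are not shown in this snippet; the following is a minimal, self-contained TensorFlow sketch of the usual squashed-Gaussian sampling with the standard log-probability correction (all names and shapes are illustrative):

import numpy as np
import tensorflow as tf

def squashed_gaussian_sample(mean, log_std, max_action=1., eps=1e-6):
    # Reparameterized sample from a diagonal Gaussian.
    std = tf.exp(log_std)
    raw_action = mean + std * tf.random.normal(tf.shape(mean))
    # Diagonal Gaussian log-density of the raw (pre-squash) sample.
    log_prob = tf.reduce_sum(
        -0.5 * (((raw_action - mean) / std) ** 2 + 2. * log_std + np.log(2. * np.pi)),
        axis=-1)
    # Squash into (-1, 1) and apply the change-of-variables correction.
    action = tf.tanh(raw_action)
    log_prob -= tf.reduce_sum(tf.math.log(1. - action ** 2 + eps), axis=-1)
    return max_action * action, log_prob

mean = tf.zeros((4, 2))            # batch of 4, action_dim = 2
log_std = tf.fill((4, 2), -1.)
actions, logp = squashed_gaussian_sample(mean, log_std)
print(actions.shape, logp.shape)   # (4, 2) (4,)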
Example #6
    def __init__(self,
                 state_shape,
                 action_dim,
                 name="SAC",
                 max_action=1.,
                 lr=3e-4,
                 actor_units=[256, 256],
                 tau=0.005,
                 scale_reward=5.,
                 n_warmup=int(1e4),
                 memory_capacity=int(1e6),
                 **kwargs):
        super().__init__(name=name,
                         memory_capacity=memory_capacity,
                         n_warmup=n_warmup,
                         **kwargs)

        self.actor = GaussianActor(state_shape,
                                   action_dim,
                                   max_action,
                                   squash=True,
                                   tanh_mean=False,
                                   tanh_std=False)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        self.vf = CriticV(state_shape)
        self.vf_target = CriticV(state_shape)
        update_target_variables(self.vf_target.weights,
                                self.vf.weights,
                                tau=1.)
        self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        self.qf1 = CriticQ(state_shape, action_dim, name="qf1")
        self.qf2 = CriticQ(state_shape, action_dim, name="qf2")
        self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

        # Set hyper-parameters
        self.tau = tau
        self.scale_reward = scale_reward
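
The call `update_target_variables(self.vf_target.weights, self.vf.weights, tau=1.)` above hard-copies the freshly built value network into its target; during training the same kind of update is typically applied with the small `tau` (0.005 here) so the target slowly tracks the online network. A minimal, self-contained sketch of such a Polyak update (the helper name and networks below are illustrative, not the library's implementation):

import tensorflow as tf

def soft_update(target_vars, source_vars, tau=0.005):
    # Polyak-average source variables into target variables in place.
    # tau=1.0 degenerates to a hard copy, as used right after construction.
    for target, source in zip(target_vars, source_vars):
        target.assign(tau * source + (1. - tau) * target)

make_vf = lambda: tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(4,)),
    tf.keras.layers.Dense(1)])
vf, vf_target = make_vf(), make_vf()

soft_update(vf_target.weights, vf.weights, tau=1.0)    # hard copy at initialization
soft_update(vf_target.weights, vf.weights, tau=0.005)  # slow tracking during training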
Example #7
class VPG(OnPolicyAgent):
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 actor=None,
                 critic=None,
                 actor_critic=None,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 const_std=0.3,
                 hidden_activation_actor="relu",
                 hidden_activation_critic="relu",
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete

        # TODO: clean codes
        if actor_critic is not None:
            self.actor_critic = actor_critic
            self.actor_critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.actor = None
            self.critic = None
        else:
            self.actor_critic = None
            if actor is None:
                if is_discrete:
                    self.actor = CategoricalActor(state_shape, action_dim,
                                                  actor_units)
                else:
                    self.actor = GaussianActor(
                        state_shape,
                        action_dim,
                        max_action,
                        actor_units,
                        hidden_activation=hidden_activation_actor,
                        fix_std=fix_std,
                        const_std=const_std,
                        state_independent_std=True)
            else:
                self.actor = actor
            if critic is None:
                self.critic = CriticV(
                    state_shape,
                    critic_units,
                    hidden_activation=hidden_activation_critic)
            else:
                self.critic = critic
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_critic)

        # This is used to check if input state to `get_action` is multiple (batch) or single
        self._state_ndim = np.array(state_shape).shape[0]

    def get_action(self, state, test=False):
        if isinstance(state, LazyFrames):
            state = np.array(state)
        msg = "Input instance should be np.ndarray, not {}".format(type(state))
        assert isinstance(state, np.ndarray), msg

        is_single_input = state.ndim == self._state_ndim
        if is_single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)
        action, logp, _ = self._get_action_body(state, test)

        if is_single_input:
            return action.numpy()[0], logp.numpy()
        else:
            return action.numpy(), logp.numpy()

    def get_action_and_val(self, state, test=False):
        if isinstance(state, LazyFrames):
            state = np.array(state)
        is_single_input = state.ndim == self._state_ndim
        if is_single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)

        action, logp, v = self._get_action_logp_v_body(state, test)

        if is_single_input:
            v = v[0]
            action = action[0]

        return action.numpy(), logp.numpy(), v.numpy()

    @tf.function
    def _get_action_logp_v_body(self, state, test):
        if self.actor_critic:
            return self.actor_critic(state, test)
        else:
            action, logp, _ = self.actor(state, test)
            v = self.critic(state)
            return action, logp, v

    @tf.function
    def _get_action_body(self, state, test):
        if self.actor_critic is not None:
            action, logp, param = self.actor_critic(state, test)
            return action, logp, param
        else:
            return self.actor(state, test)

    def train(self, states, actions, advantages, logp_olds, returns):
        # Train actor and critic
        actor_loss, logp_news = self._train_actor_body(states, actions,
                                                       advantages, logp_olds)
        critic_loss = self._train_critic_body(states, returns)
        # Visualize results in TensorBoard
        tf.summary.scalar(name=self.policy_name + "/actor_loss",
                          data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_max",
                          data=np.max(logp_news))
        tf.summary.scalar(name=self.policy_name + "/logp_min",
                          data=np.min(logp_news))
        tf.summary.scalar(name=self.policy_name + "/logp_mean",
                          data=np.mean(logp_news))
        tf.summary.scalar(name=self.policy_name + "/adv_max",
                          data=np.max(advantages))
        tf.summary.scalar(name=self.policy_name + "/adv_min",
                          data=np.min(advantages))
        tf.summary.scalar(name=self.policy_name + "/kl",
                          data=tf.reduce_mean(logp_olds - logp_news))
        tf.summary.scalar(name=self.policy_name + "/critic_loss",
                          data=critic_loss)
        return actor_loss, critic_loss

    @tf.function
    def _train_actor_body(self, states, actions, advantages, logp_olds):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                log_probs = self.actor.compute_log_probs(states, actions)
                weights = tf.stop_gradient(tf.squeeze(advantages))
                # + lambda * entropy
                actor_loss = tf.reduce_mean(-log_probs * weights)
            actor_grads = tape.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grads, self.actor.trainable_variables))

        return actor_loss, log_probs

    @tf.function
    def _train_critic_body(self, states, returns):
        with tf.device(self.device):
            # Train baseline
            with tf.GradientTape() as tape:
                current_V = self.critic(states)
                td_errors = tf.squeeze(returns) - current_V
                critic_loss = tf.reduce_mean(0.5 * tf.square(td_errors))
            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

        return critic_loss
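
`_train_actor_body` above is the vanilla policy-gradient step: the loss is the negative log-probability of the taken actions weighted by the (gradient-stopped) advantages, while `_train_critic_body` fits the value baseline to the returns. A minimal, self-contained sketch of the same actor update with a small categorical policy (the network, shapes, and data below are illustrative):

import tensorflow as tf

policy = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(4,)),
    tf.keras.layers.Dense(3)])                       # logits over 3 discrete actions
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def train_actor(states, actions, advantages):
    with tf.GradientTape() as tape:
        logits = policy(states)
        # log pi(a|s) of the actions actually taken
        log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=actions, logits=logits)
        weights = tf.stop_gradient(tf.squeeze(advantages))
        actor_loss = tf.reduce_mean(-log_probs * weights)   # REINFORCE / VPG loss
    grads = tape.gradient(actor_loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return actor_loss

states = tf.random.normal((8, 4))
actions = tf.random.uniform((8,), maxval=3, dtype=tf.int32)
advantages = tf.random.normal((8, 1))
print(train_actor(states, actions, advantages).numpy())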
Example #8
class VPG(OnPolicyAgent):
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 tanh_std=False,
                 const_std=0.3,
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete
        if is_discrete:
            self.actor = CategoricalActor(state_shape, action_dim, actor_units)
        else:
            self.actor = GaussianActor(state_shape,
                                       action_dim,
                                       max_action,
                                       actor_units,
                                       fix_std=fix_std,
                                       tanh_std=tanh_std,
                                       const_std=const_std)
        self.critic = CriticV(state_shape, critic_units)
        self._action_dim = action_dim
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_critic)

    def get_action(self, state, test=False):
        assert isinstance(state, np.ndarray)

        single_input = state.ndim == 1
        if single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)
        action, logp_pi = self._get_action_body(state, test)

        if single_input:
            return action.numpy()[0], logp_pi.numpy()
        else:
            return action.numpy(), logp_pi.numpy()

    def get_action_and_val(self, state, test=False):
        single_input = state.ndim == 1
        if single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)
        action, logp_pi = self.get_action(state, test)
        val = self.critic(state)
        if single_input:
            val = val[0]
            action = action[0]
        return action, logp_pi, val.numpy()

    @tf.function
    def _get_action_body(self, state, test):
        return self.actor(state, test)

    def train_actor(self, states, actions, advantages, logp_olds):
        actor_loss, log_probs = self._train_actor_body(states, actions,
                                                       advantages)
        tf.summary.scalar(name=self.policy_name + "/actor_loss",
                          data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_max",
                          data=np.max(log_probs))
        tf.summary.scalar(name=self.policy_name + "/logp_min",
                          data=np.min(log_probs))
        tf.summary.scalar(name=self.policy_name + "/logp_mean",
                          data=np.mean(log_probs))
        tf.summary.scalar(name=self.policy_name + "/adv_max",
                          data=np.max(advantages))
        tf.summary.scalar(name=self.policy_name + "/adv_min",
                          data=np.min(advantages))
        # TODO: Compute KL divergence and output it
        return actor_loss

    def train_critic(self, states, returns):
        critic_loss = self._train_critic_body(states, returns)
        tf.summary.scalar(name=self.policy_name + "/critic_loss",
                          data=critic_loss)
        return critic_loss

    @tf.function
    def _train_actor_body(self, states, actions, advantages):
        with tf.device(self.device):
            # Train policy
            with tf.GradientTape() as tape:
                log_probs = self.actor.compute_log_probs(states, actions)
                weights = tf.stop_gradient(tf.squeeze(advantages))
                # + lambda * entropy
                actor_loss = tf.reduce_mean(-log_probs * weights)
            actor_grad = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

        return actor_loss, log_probs

    @tf.function
    def _train_critic_body(self, states, returns):
        with tf.device(self.device):
            # Train baseline
            with tf.GradientTape() as tape:
                current_V = self.critic(states)
                td_errors = tf.squeeze(returns) - current_V
                critic_loss = tf.reduce_mean(0.5 * tf.square(td_errors))
            critic_grad = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))

        return critic_loss
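
In this variant the actor and critic are trained through separate `train_actor` / `train_critic` calls, but the critic step is the same baseline regression as in the previous example: V(s) is fit to the empirical returns with a 0.5 * squared-error loss. A minimal, self-contained sketch of that regression step (network and shapes are illustrative):

import tensorflow as tf

critic = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(4,)),
    tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-3)

@tf.function
def train_critic(states, returns):
    with tf.GradientTape() as tape:
        current_v = tf.squeeze(critic(states), axis=1)    # V(s), shape (batch,)
        td_errors = tf.squeeze(returns) - current_v       # target minus prediction
        critic_loss = tf.reduce_mean(0.5 * tf.square(td_errors))
    grads = tape.gradient(critic_loss, critic.trainable_variables)
    optimizer.apply_gradients(zip(grads, critic.trainable_variables))
    return critic_loss

states = tf.random.normal((8, 4))
returns = tf.random.normal((8, 1))
print(train_critic(states, returns).numpy())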