@classmethod
def setUpClass(cls):
    super().setUpClass()
    cls.policy = GaussianActor(
        state_shape=cls.continuous_env.observation_space.shape,
        action_dim=cls.continuous_env.action_space.low.size,
        max_action=1.,
        units=[4, 4])
    cls.const_std = 0.1
    cls.policy_fixed_sigma = GaussianActor(
        state_shape=cls.continuous_env.observation_space.shape,
        action_dim=cls.continuous_env.action_space.low.size,
        max_action=1.,
        units=[4, 4],
        fix_std=True,
        const_std=cls.const_std)
@classmethod
def setUpClass(cls):
    super().setUpClass()
    cls.policy = GaussianActor(
        state_shape=cls.continuous_env.observation_space.shape,
        action_dim=cls.continuous_env.action_space.low.size,
        max_action=1.,
        units=[4, 4])
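# A minimal usage sketch for the fixtures above.  Assumptions: a Gym-style
# continuous environment stands in for cls.continuous_env, GaussianActor is
# importable from the library as in the snippets here, and calling the actor
# on a batch of states returns a tuple whose first element is the sampled
# actions (the exact return arity differs between library versions).
import gym
import numpy as np

env = gym.make("Pendulum-v0")
policy = GaussianActor(
    state_shape=env.observation_space.shape,
    action_dim=env.action_space.low.size,
    max_action=1.,
    units=[4, 4])

states = np.stack(
    [env.observation_space.sample() for _ in range(8)]).astype(np.float32)
outputs = policy(states)                    # e.g. (actions, log_probs, ...)
log_probs = policy.compute_log_probs(states, outputs[0])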
def _setup_actor(self, state_shape, action_dim, actor_units, lr, max_action=1.):
    self.actor = GaussianActor(
        state_shape, action_dim, max_action, squash=True, units=actor_units)
    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
def __init__(self, state_shape, action_dim, name="SAC", max_action=1., lr=3e-4, actor_units=[256, 256], tau=0.005, scale_reward=5., n_warmup=int(1e4), memory_capacity=int(1e6), **kwargs): super().__init__(name=name, memory_capacity=memory_capacity, n_warmup=n_warmup, **kwargs) self.actor = GaussianActor(state_shape, action_dim, max_action, squash=True, tanh_mean=False, tanh_std=False) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr) self.vf = CriticV(state_shape) self.vf_target = CriticV(state_shape) update_target_variables(self.vf_target.weights, self.vf.weights, tau=1.) self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr) self.qf1 = CriticQ(state_shape, action_dim, name="qf1") self.qf2 = CriticQ(state_shape, action_dim, name="qf2") self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr) self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr) # Set hyper-parameters self.tau = tau self.scale_reward = scale_reward
class VPG(OnPolicyAgent):
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 actor=None,
                 critic=None,
                 actor_critic=None,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 const_std=0.3,
                 hidden_activation_actor="relu",
                 hidden_activation_critic="relu",
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete

        # TODO: clean codes
        if actor_critic is not None:
            self.actor_critic = actor_critic
            self.actor_critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.actor = None
            self.critic = None
        else:
            self.actor_critic = None
            if actor is None:
                if is_discrete:
                    self.actor = CategoricalActor(state_shape, action_dim, actor_units)
                else:
                    self.actor = GaussianActor(
                        state_shape, action_dim, max_action, actor_units,
                        hidden_activation=hidden_activation_actor,
                        fix_std=fix_std,
                        const_std=const_std,
                        state_independent_std=True)
            else:
                self.actor = actor
            if critic is None:
                self.critic = CriticV(
                    state_shape, critic_units,
                    hidden_activation=hidden_activation_critic)
            else:
                self.critic = critic
            self.actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_actor)
            self.critic_optimizer = tf.keras.optimizers.Adam(
                learning_rate=lr_critic)

        # This is used to check if input state to `get_action` is multiple (batch) or single
        self._state_ndim = np.array(state_shape).shape[0]

    def get_action(self, state, test=False):
        if isinstance(state, LazyFrames):
            state = np.array(state)
        msg = "Input instance should be np.ndarray, not {}".format(type(state))
        assert isinstance(state, np.ndarray), msg

        is_single_input = state.ndim == self._state_ndim
        if is_single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)

        action, logp, _ = self._get_action_body(state, test)

        if is_single_input:
            return action.numpy()[0], logp.numpy()
        else:
            return action.numpy(), logp.numpy()

    def get_action_and_val(self, state, test=False):
        if isinstance(state, LazyFrames):
            state = np.array(state)
        is_single_input = state.ndim == self._state_ndim
        if is_single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)

        action, logp, v = self._get_action_logp_v_body(state, test)

        if is_single_input:
            v = v[0]
            action = action[0]

        return action.numpy(), logp.numpy(), v.numpy()

    @tf.function
    def _get_action_logp_v_body(self, state, test):
        if self.actor_critic:
            return self.actor_critic(state, test)
        else:
            action, logp, _ = self.actor(state, test)
            v = self.critic(state)
            return action, logp, v

    @tf.function
    def _get_action_body(self, state, test):
        if self.actor_critic is not None:
            action, logp, param = self.actor_critic(state, test)
            return action, logp, param
        else:
            return self.actor(state, test)

    def train(self, states, actions, advantages, logp_olds, returns):
        # Train actor and critic
        actor_loss, logp_news = self._train_actor_body(
            states, actions, advantages, logp_olds)
        critic_loss = self._train_critic_body(states, returns)

        # Visualize results in TensorBoard
        tf.summary.scalar(name=self.policy_name + "/actor_loss", data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_max", data=np.max(logp_news))
        tf.summary.scalar(name=self.policy_name + "/logp_min", data=np.min(logp_news))
        tf.summary.scalar(name=self.policy_name + "/logp_mean", data=np.mean(logp_news))
        tf.summary.scalar(name=self.policy_name + "/adv_max", data=np.max(advantages))
        tf.summary.scalar(name=self.policy_name + "/adv_min", data=np.min(advantages))
        tf.summary.scalar(name=self.policy_name + "/kl",
                          data=tf.reduce_mean(logp_olds - logp_news))
        tf.summary.scalar(name=self.policy_name + "/critic_loss", data=critic_loss)

        return actor_loss, critic_loss

    @tf.function
    def _train_actor_body(self, states, actions, advantages, logp_olds):
        with tf.device(self.device):
            with tf.GradientTape() as tape:
                log_probs = self.actor.compute_log_probs(states, actions)
                weights = tf.stop_gradient(tf.squeeze(advantages))
                # + lambda * entropy
                actor_loss = tf.reduce_mean(-log_probs * weights)
            actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grads, self.actor.trainable_variables))
        return actor_loss, log_probs

    @tf.function
    def _train_critic_body(self, states, returns):
        with tf.device(self.device):
            # Train baseline
            with tf.GradientTape() as tape:
                current_V = self.critic(states)
                td_errors = tf.squeeze(returns) - current_V
                critic_loss = tf.reduce_mean(0.5 * tf.square(td_errors))
            critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))
        return critic_loss
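# Illustrative rollout/update loop for the VPG agent above; a sketch under
# stated assumptions, not the library's training script.  It assumes the
# classic Gym step/reset API, that the OnPolicyAgent base class provides
# usable defaults (device, policy_name, ...), and that advantages/returns
# would normally come from GAE; the zero-filled arrays here only
# demonstrate the expected call signature of train().
import gym
import numpy as np

env = gym.make("Pendulum-v0")
agent = VPG(
    state_shape=env.observation_space.shape,
    action_dim=env.action_space.low.size,
    is_discrete=False)

obs = env.reset()
state_buf, act_buf, logp_buf = [], [], []
for _ in range(32):
    act, logp, _val = agent.get_action_and_val(np.asarray(obs, dtype=np.float32))
    next_obs, _rew, done, _info = env.step(act)
    state_buf.append(obs)
    act_buf.append(act)
    logp_buf.append(logp)
    obs = env.reset() if done else next_obs

states = np.asarray(state_buf, dtype=np.float32)
actions = np.asarray(act_buf, dtype=np.float32)
logp_olds = np.asarray(logp_buf, dtype=np.float32).ravel()
advantages = np.zeros((len(states), 1), dtype=np.float32)  # placeholder for GAE advantages
returns = np.zeros((len(states), 1), dtype=np.float32)     # placeholder for discounted returns
agent.train(states, actions, advantages, logp_olds, returns)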
class VPG(OnPolicyAgent):
    def __init__(self,
                 state_shape,
                 action_dim,
                 is_discrete,
                 max_action=1.,
                 actor_units=[256, 256],
                 critic_units=[256, 256],
                 lr_actor=1e-3,
                 lr_critic=3e-3,
                 fix_std=False,
                 tanh_std=False,
                 const_std=0.3,
                 name="VPG",
                 **kwargs):
        super().__init__(name=name, **kwargs)
        self._is_discrete = is_discrete

        if is_discrete:
            self.actor = CategoricalActor(state_shape, action_dim, actor_units)
        else:
            self.actor = GaussianActor(
                state_shape, action_dim, max_action, actor_units,
                fix_std=fix_std, tanh_std=tanh_std, const_std=const_std)
        self.critic = CriticV(state_shape, critic_units)

        self._action_dim = action_dim
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            learning_rate=lr_critic)

    def get_action(self, state, test=False):
        assert isinstance(state, np.ndarray)
        single_input = state.ndim == 1
        if single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)

        action, logp_pi = self._get_action_body(state, test)

        if single_input:
            return action.numpy()[0], logp_pi.numpy()
        else:
            return action.numpy(), logp_pi.numpy()

    def get_action_and_val(self, state, test=False):
        single_input = state.ndim == 1
        if single_input:
            state = np.expand_dims(state, axis=0).astype(np.float32)

        action, logp_pi = self.get_action(state, test)
        val = self.critic(state)

        if single_input:
            val = val[0]
            action = action[0]

        return action, logp_pi, val.numpy()

    @tf.function
    def _get_action_body(self, state, test):
        return self.actor(state, test)

    def train_actor(self, states, actions, advantages, logp_olds):
        actor_loss, log_probs = self._train_actor_body(states, actions, advantages)
        tf.summary.scalar(name=self.policy_name + "/actor_loss", data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_max", data=np.max(log_probs))
        tf.summary.scalar(name=self.policy_name + "/logp_min", data=np.min(log_probs))
        tf.summary.scalar(name=self.policy_name + "/logp_mean", data=np.mean(log_probs))
        tf.summary.scalar(name=self.policy_name + "/adv_max", data=np.max(advantages))
        tf.summary.scalar(name=self.policy_name + "/adv_min", data=np.min(advantages))
        # TODO: Compute KL divergence and output it
        return actor_loss

    def train_critic(self, states, returns):
        critic_loss = self._train_critic_body(states, returns)
        tf.summary.scalar(name=self.policy_name + "/critic_loss", data=critic_loss)
        return critic_loss

    @tf.function
    def _train_actor_body(self, states, actions, advantages):
        with tf.device(self.device):
            # Train policy
            with tf.GradientTape() as tape:
                log_probs = self.actor.compute_log_probs(states, actions)
                weights = tf.stop_gradient(tf.squeeze(advantages))
                # + lambda * entropy
                actor_loss = tf.reduce_mean(-log_probs * weights)
            actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))
        return actor_loss, log_probs

    @tf.function
    def _train_critic_body(self, states, returns):
        with tf.device(self.device):
            # Train baseline
            with tf.GradientTape() as tape:
                current_V = self.critic(states)
                td_errors = tf.squeeze(returns) - current_V
                critic_loss = tf.reduce_mean(0.5 * tf.square(td_errors))
            critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grad, self.critic.trainable_variables))
        return critic_loss
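# The variant above exposes a two-step update (train_actor / train_critic)
# instead of the single train() call shown earlier.  A minimal sketch of one
# update, assuming `agent` is an instance of this variant and that `states`,
# `actions`, `advantages`, `logp_olds`, and `returns` are float32 arrays
# collected as in the previous example:
actor_loss = agent.train_actor(states, actions, advantages, logp_olds)
critic_loss = agent.train_critic(states, returns)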