def __init__(self,
             state_dim,
             action_dim,
             name="SAC",
             max_action=1.,
             lr=3e-4,
             actor_units=[256, 256],
             tau=0.005,
             scale_reward=5.,
             n_warmup=int(1e4),
             memory_capacity=int(1e6),
             **kwargs):
    super().__init__(name=name, memory_capacity=memory_capacity,
                     n_warmup=n_warmup, **kwargs)

    self.actor = GaussianActor(state_dim, action_dim, max_action)
    self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    self.vf = CriticV(state_dim)
    self.vf_target = CriticV(state_dim)
    update_target_variables(self.vf_target.weights, self.vf.weights, tau=1.)
    self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    self.qf1 = CriticQ(state_dim, action_dim, name="qf1")
    self.qf2 = CriticQ(state_dim, action_dim, name="qf2")
    self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # Set hyper-parameters
    self.tau = tau
    self.scale_reward = scale_reward
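# NOTE (editor's sketch): `update_target_variables` is imported from the
# library's utilities and is not shown in this excerpt. Based on the update
# rule documented in the TD3 docstring below
# (``target = (1-tau)*target + tau*network``), a minimal stand-in could look
# like this; the name and signature here are assumptions, not the library's
# actual implementation.
def update_target_variables_sketch(target_variables, source_variables, tau=1.0):
    """Polyak-average source variables into target variables in place.

    tau=1.0 copies the weights outright (hard update, used at initialization);
    a small tau such as 0.005 gives the soft target update used in training.
    """
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign((1.0 - tau) * target_var + tau * source_var)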
def _setup_critic_v(self, state_shape, critic_units, lr):
    self.vf = CriticV(state_shape, critic_units)
    self.vf_target = CriticV(state_shape, critic_units)
    update_target_variables(self.vf_target.weights, self.vf.weights, tau=1.)
    self.vf_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
def __init__(self, state_shape, action_dim, name="TD3", actor_update_freq=2, policy_noise=0.2, noise_clip=0.5, actor_units=[400, 300], critic_units=[400, 300], lr_critic=0.001, **kwargs): super().__init__(name=name, state_shape=state_shape, action_dim=action_dim, actor_units=actor_units, critic_units=critic_units, lr_critic=lr_critic, **kwargs) self.critic = Critic(state_shape, action_dim, critic_units) self.critic_target = Critic(state_shape, action_dim, critic_units) update_target_variables(self.critic_target.weights, self.critic.weights, tau=1.) self.critic_optimizer = tf.keras.optimizers.Adam( learning_rate=lr_critic) self._policy_noise = policy_noise self._noise_clip = noise_clip self._actor_update_freq = actor_update_freq self._it = tf.Variable(0, dtype=tf.int32)
def _update_critic(self, states, actions, next_states, rewards, dones, weights):
    with tf.device(self.device):
        assert len(dones.shape) == 2
        assert len(rewards.shape) == 2
        rewards = tf.squeeze(rewards, axis=1)
        dones = tf.squeeze(dones, axis=1)

        not_dones = 1. - tf.cast(dones, dtype=tf.float32)
        with tf.GradientTape(persistent=True) as tape:
            # Compute loss of critic Q
            next_actions, next_logps = self.actor(next_states)
            next_target_q1 = tf.stop_gradient(
                self.qf1_target(next_states, next_actions))
            next_target_q2 = tf.stop_gradient(
                self.qf2_target(next_states, next_actions))
            min_next_target_q = tf.minimum(next_target_q1, next_target_q2)

            target_q = tf.stop_gradient(
                rewards + not_dones * self.discount *
                (min_next_target_q - self.alpha * next_logps))

            current_q1 = self.qf1(states, actions)
            current_q2 = self.qf2(states, actions)
            td_loss_q1 = tf.reduce_mean((target_q - current_q1) ** 2)
            td_loss_q2 = tf.reduce_mean((target_q - current_q2) ** 2)  # Eq.(6)

        q1_grad = tape.gradient(td_loss_q1, self.qf1.trainable_variables)
        self.qf1_optimizer.apply_gradients(
            zip(q1_grad, self.qf1.trainable_variables))
        q2_grad = tape.gradient(td_loss_q2, self.qf2.trainable_variables)
        self.qf2_optimizer.apply_gradients(
            zip(q2_grad, self.qf2.trainable_variables))

        update_target_variables(self.qf1_target.weights, self.qf1.weights, self.tau)
        update_target_variables(self.qf2_target.weights, self.qf2.weights, self.tau)

    return td_loss_q1 + td_loss_q2, td_loss_q1
def _train_body(self, states, actions, next_states, rewards, done, weights):
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            td_errors = self._compute_td_error_body(
                states, actions, next_states, rewards, done)
            critic_loss = tf.reduce_mean(
                huber_loss(td_errors, delta=self.max_grad) * weights)

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        critic_grad = [tf.clip_by_value(
            grad,
            tf.constant(-self.max_grad, dtype=tf.float32),
            tf.constant(self.max_grad, dtype=tf.float32))
            for grad in critic_grad]
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            next_action = self.actor(states)
            actor_loss = -tf.reduce_mean(self.critic([states, next_action]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        # Update target networks
        update_target_variables(self.critic_target.weights, self.critic.weights, self.tau)
        update_target_variables(self.actor_target.weights, self.actor.weights, self.tau)

        return actor_loss, critic_loss, td_errors
def _train_body(self, states, actions, next_states, rewards, done, weights):
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            td_errors = self._compute_td_error_body(
                states, actions, next_states, rewards, done)
            critic_loss = tf.reduce_mean(tf.square(td_errors) * weights * 0.5)

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            next_action = self.actor(states)
            actor_loss = -tf.reduce_mean(self.critic([states, next_action]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        # Update target networks
        update_target_variables(self.critic_target.weights, self.critic.weights, self.tau)
        update_target_variables(self.actor_target.weights, self.actor.weights, self.tau)

        return actor_loss, critic_loss, td_errors
def __init__(self, state_shape, action_dim, name="TD3", actor_update_freq=2, policy_noise=0.2, noise_clip=0.5, critic_units=(400, 300), **kwargs): super().__init__(name=name, state_shape=state_shape, action_dim=action_dim, **kwargs) self.critic = Critic(state_shape, action_dim, critic_units) self.critic_target = Critic(state_shape, action_dim, critic_units) update_target_variables(self.critic_target.weights, self.critic.weights, tau=1.) self._policy_noise = policy_noise self._noise_clip = noise_clip self._actor_update_freq = actor_update_freq self._it = tf.Variable(0, dtype=tf.int32)
def _update_encoder(self, obses_anchor, obses_negative):
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            # Compute loss of CURL
            z_anchor = self._encoder(obses_anchor)
            z_negatives = self._encoder_target(obses_negative)
            # Compute similarities with bilinear products
            logits = tf.matmul(
                z_anchor,
                tf.matmul(self._curl_w, tf.transpose(z_negatives, [1, 0])))
            # (batch_size, batch_size)
            logits -= tf.reduce_max(logits, axis=-1, keepdims=True)
            curl_loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    tf.range(self.batch_size), logits, from_logits=True))  # Eq.4

        curl_grads = tape.gradient(
            curl_loss, [self._curl_w] + self._encoder.trainable_variables)
        self._encoder_optimizer.apply_gradients(
            zip(curl_grads, [self._curl_w] + self._encoder.trainable_variables))
        update_target_variables(
            self._encoder_target.weights, self._encoder.weights, self._tau_encoder)

    return curl_loss, tf.reduce_mean(tf.abs(self._curl_w)), \
        tf.reduce_mean(tf.abs(z_anchor)), tf.reduce_mean(logits)
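# NOTE (editor's sketch): the bilinear similarity above on toy shapes. With
# batch size B and feature dim D, `logits` is a (B, B) matrix whose diagonal
# pairs each anchor with its own augmented key, so the correct "class" for
# row i is i and the labels are simply tf.range(B). All names below are
# illustrative, not part of the library.
import tensorflow as tf

B, D = 4, 8
z_anchor = tf.random.normal((B, D))             # online-encoder features
z_keys = tf.random.normal((B, D))               # target-encoder features
curl_w = tf.Variable(tf.random.normal((D, D)))  # learned bilinear weight
logits = tf.matmul(z_anchor, tf.matmul(curl_w, tf.transpose(z_keys)))  # (B, B)
loss = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
    tf.range(B), logits, from_logits=True))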
def _train_body(self, states, actions, next_states, rewards, done, weights):
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            td_error1, td_error2 = self._compute_td_error_body(
                states, actions, next_states, rewards, done)
            critic_loss = \
                tf.reduce_mean(huber_loss(td_error1, delta=self.max_grad) * weights) + \
                tf.reduce_mean(huber_loss(td_error2, delta=self.max_grad) * weights)

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        self._it.assign_add(1)
        with tf.GradientTape() as tape:
            next_actions = self.actor(states)
            actor_loss = -tf.reduce_mean(self.critic([states, next_actions]))

        if tf.math.equal(self._it % self._actor_update_freq, 0):
            actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))
            # Update target networks
            update_target_variables(self.critic_target.weights, self.critic.weights, self.tau)
            update_target_variables(self.actor_target.weights, self.actor.weights, self.tau)

        return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs(td_error2)
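# NOTE (editor's sketch): `_compute_td_error_body` is called by the TD3 train
# bodies above but not included in this excerpt. A standard implementation of
# the clipped-double-Q target with target policy smoothing, using the
# `policy_noise` / `noise_clip` attributes set in __init__, could look like the
# following; the critic/actor call signatures and the `max_action` attribute
# are assumptions.
def _compute_td_error_body_sketch(self, states, actions, next_states, rewards, dones):
    not_dones = 1. - tf.cast(dones, dtype=tf.float32)
    # Target policy smoothing: perturb the target action with clipped noise
    next_actions = self.actor_target(next_states)
    noise = tf.clip_by_value(
        tf.random.normal(shape=tf.shape(next_actions), stddev=self._policy_noise),
        -self._noise_clip, self._noise_clip)
    next_actions = tf.clip_by_value(
        next_actions + noise,
        -self.actor_target.max_action, self.actor_target.max_action)
    # Clipped double Q-learning: bootstrap from the smaller target Q
    next_q1, next_q2 = self.critic_target(next_states, next_actions)
    target_q = tf.stop_gradient(
        rewards + not_dones * self.discount * tf.minimum(next_q1, next_q2))
    current_q1, current_q2 = self.critic(states, actions)
    return target_q - current_q1, target_q - current_q2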
def train(self, states, actions, next_states, rewards, done, weights=None):
    if weights is None:
        weights = np.ones_like(rewards)
    td_errors, q_func_loss = self._train_body(
        states, actions, next_states, rewards, done, weights)

    tf.summary.scalar(name=self.policy_name + "/q_func_Loss", data=q_func_loss)

    # TODO: Remove following by using tf.global_step
    self.n_update += 1
    # Update target networks
    if self.n_update % self.target_replace_interval == 0:
        update_target_variables(
            self.q_func_target.weights, self.q_func.weights, tau=1.)

    # Update exploration rate
    self.epsilon = max(
        self.epsilon - self.epsilon_decay_rate * self.update_interval,
        self.epsilon_min)
    tf.summary.scalar(name=self.policy_name + "/epsilon", data=self.epsilon)

    return td_errors
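# NOTE (editor's sketch): the linear epsilon schedule used above, on concrete
# (illustrative) numbers. With epsilon=1.0, epsilon_min=0.1 and
# epsilon_decay_step=1e6, __init__ sets
#     epsilon_decay_rate = (1.0 - 0.1) / 1e6 = 9e-7
# and every `train` call then subtracts epsilon_decay_rate * update_interval
# until epsilon_min is reached:
epsilon, epsilon_min, update_interval = 1.0, 0.1, 4
epsilon_decay_rate = (epsilon - epsilon_min) / int(1e6)
epsilon = max(epsilon - epsilon_decay_rate * update_interval, epsilon_min)  # -> 0.9999964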
def _setup_critic_q(self, state_shape, action_dim, critic_units, lr):
    self.qf1 = self.critic_fn(state_shape, action_dim, critic_units, name="qf1")
    self.qf2 = self.critic_fn(state_shape, action_dim, critic_units, name="qf2")
    self.qf1_target = self.critic_fn(state_shape, action_dim, critic_units, name="qf1_target")
    self.qf2_target = self.critic_fn(state_shape, action_dim, critic_units, name="qf2_target")
    update_target_variables(self.qf1_target.weights, self.qf1.weights, tau=1.)
    update_target_variables(self.qf2_target.weights, self.qf2.weights, tau=1.)
    self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
def _update_encoder(self, obses):
    with tf.device(self.device):
        with tf.GradientTape(persistent=True) as tape:
            # Extract latent features from raw observations
            obs_features = self._encoder(obses, stop_q_grad=self._stop_q_grad)

            # Compute loss of AE
            rec_obses = self._decoder(obs_features)
            true_obses = preprocess_img(obses)
            rec_loss = tf.reduce_mean(tf.keras.losses.MSE(true_obses, rec_obses))
            latent_loss = tf.reduce_mean(
                0.5 * tf.reduce_sum(tf.math.pow(obs_features, 2), axis=1))
            ae_loss = rec_loss + self._lambda_latent_val * latent_loss

        encoder_grads = tape.gradient(ae_loss, self._encoder.trainable_variables)
        self._encoder_optimizer.apply_gradients(
            zip(encoder_grads, self._encoder.trainable_variables))
        decoder_grads = tape.gradient(ae_loss, self._decoder.trainable_variables)
        # Apply decoder gradients with the decoder's own weight-decayed
        # optimizer (set up in __init__), not the encoder optimizer
        self._decoder_optimizer.apply_gradients(
            zip(decoder_grads, self._decoder.trainable_variables))

        update_target_variables(
            self._encoder_target.weights, self._encoder.weights, self._tau_encoder)

    return rec_loss, latent_loss
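# NOTE (editor's sketch): `preprocess_img` is imported elsewhere and not shown
# in this excerpt. For the reconstruction target above it plausibly rescales
# uint8 pixels into a small centered range; this is an assumption about its
# behavior, not the library's code.
def preprocess_img_sketch(obses):
    return tf.cast(obses, tf.float32) / 255. - 0.5  # [0, 255] -> [-0.5, 0.5]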
def __init__(self,
             action_dim,
             obs_shape=(84, 84, 9),
             n_conv_layers=4,
             n_conv_filters=32,
             feature_dim=50,
             tau_encoder=0.05,
             tau_critic=0.01,
             auto_alpha=True,
             lr_sac=1e-3,
             lr_encoder=1e-3,
             lr_decoder=1e-3,
             update_critic_target_freq=2,
             update_actor_freq=2,
             lr_alpha=1e-4,
             init_temperature=0.1,
             stop_q_grad=False,
             lambda_latent_val=1e-06,
             decoder_weight_lambda=1e-07,
             skip_making_decoder=False,
             name="SACAE",
             **kwargs):
    super().__init__(state_shape=(feature_dim,),
                     action_dim=action_dim,
                     name=name,
                     lr=lr_sac,
                     lr_alpha=lr_alpha,
                     tau=tau_critic,
                     auto_alpha=auto_alpha,
                     init_temperature=init_temperature,
                     **kwargs)
    self._encoder = Encoder(obs_shape=obs_shape,
                            feature_dim=feature_dim,
                            n_conv_layers=n_conv_layers,
                            n_conv_filters=n_conv_filters,
                            name="encoder")
    self._encoder_target = Encoder(obs_shape=obs_shape,
                                   feature_dim=feature_dim,
                                   n_conv_layers=n_conv_layers,
                                   n_conv_filters=n_conv_filters,
                                   name="encoder_target")
    update_target_variables(self._encoder_target.weights,
                            self._encoder.weights, tau=1.)
    self._encoder_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_encoder)

    if not skip_making_decoder:
        self._decoder = Decoder()
        self._lambda_latent_val = lambda_latent_val
        self._decoder_optimizer = tfa.optimizers.AdamW(
            learning_rate=lr_decoder, weight_decay=decoder_weight_lambda)

    self._stop_q_grad = stop_q_grad
    self._input_img_size = obs_shape[0]
    self._tau_encoder = tau_encoder
    self._n_update = 0
    self._update_critic_target_freq = update_critic_target_freq
    self._update_actor_freq = update_actor_freq
    self._feature_dim = feature_dim
    self.state_ndim = 3
def _setup_critic_q(self, state_shape, action_dim, lr):
    self.qf1 = CriticQ(state_shape, action_dim, name="qf1")
    self.qf2 = CriticQ(state_shape, action_dim, name="qf2")
    # This SAC variant bootstraps from the target V-function, which is created
    # and synchronized in `_setup_critic_v`, so the Q-networks need no
    # separate target copies here.
    self.qf1_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    self.qf2_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
def train(self, states, actions, next_states, rewards, dones, weights=None):
    if weights is None:
        weights = np.ones_like(rewards)

    obses_anchor = random_crop(states, self._input_img_size)
    next_obses_anchor = random_crop(next_states, self._input_img_size)
    obses_negative = random_crop(states, self._input_img_size)

    # Update critic
    td_errors, qf_loss = self._update_critic(
        obses_anchor, actions, next_obses_anchor, rewards, dones, weights)
    tf.summary.scalar(name=self.policy_name + "/critic_loss", data=qf_loss)

    if self._n_update % self._update_critic_target_freq == 0:
        update_target_variables(self.qf1_target.weights, self.qf1.weights, self.tau)
        update_target_variables(self.qf2_target.weights, self.qf2.weights, self.tau)

    # Update actor
    if self._n_update % self._update_actor_freq == 0:
        obs_features = self._encoder(obses_anchor)
        actor_loss, logp_min, logp_max, logp_mean, alpha_loss = self._update_actor(obs_features)
        tf.summary.scalar(name=self.policy_name + "/actor_loss", data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_min", data=logp_min)
        tf.summary.scalar(name=self.policy_name + "/logp_max", data=logp_max)
        tf.summary.scalar(name=self.policy_name + "/logp_mean", data=logp_mean)
        if self.auto_alpha:
            tf.summary.scalar(name=self.policy_name + "/log_ent", data=self.log_alpha)
            tf.summary.scalar(name=self.policy_name + "/logp_mean+target",
                              data=logp_mean + self.target_alpha)
        tf.summary.scalar(name=self.policy_name + "/ent", data=self.alpha)
        tf.summary.scalar(name=self.policy_name + "/alpha_loss", data=alpha_loss)

    # Update encoder
    curl_loss, w, z_anchor, logits = self._update_encoder(obses_anchor, obses_negative)
    tf.summary.scalar(name="encoder/curl_loss", data=curl_loss)
    tf.summary.scalar(name="encoder/latent_vars", data=z_anchor)
    tf.summary.scalar(name="encoder/w", data=w)
    tf.summary.scalar(name="encoder/logits", data=logits)

    self._n_update += 1

    return td_errors
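# NOTE (editor's sketch): `random_crop` is imported from the library's tools
# and not shown in this excerpt. CURL's augmentation crops each image in the
# batch at an independent random offset down to `self._input_img_size`; an
# assumed NumPy equivalent:
import numpy as np

def random_crop_sketch(images, output_size):
    """Randomly crop a batch of HWC images to (output_size, output_size)."""
    batch_size, height, width = images.shape[:3]
    tops = np.random.randint(0, height - output_size + 1, size=batch_size)
    lefts = np.random.randint(0, width - output_size + 1, size=batch_size)
    return np.stack([img[t:t + output_size, l:l + output_size]
                     for img, t, l in zip(images, tops, lefts)])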
def train(self, states, actions, next_states, rewards, dones, weights=None):
    if weights is None:
        weights = np.ones_like(rewards)

    # Update critic
    td_errors, qf_loss = self._update_critic(
        states, actions, next_states, rewards, dones, weights)
    tf.summary.scalar(name=self.policy_name + "/critic_loss", data=qf_loss)

    if self._n_update % self._update_critic_target_freq == 0:
        update_target_variables(self.qf1_target.weights, self.qf1.weights, self.tau)
        update_target_variables(self.qf2_target.weights, self.qf2.weights, self.tau)

    # Update actor
    if self._n_update % self._update_actor_freq == 0:
        obs_features = self._encoder(states)
        actor_loss, logp_min, logp_max, logp_mean, alpha_loss = self._update_actor(obs_features)
        tf.summary.scalar(name=self.policy_name + "/actor_loss", data=actor_loss)
        tf.summary.scalar(name=self.policy_name + "/logp_min", data=logp_min)
        tf.summary.scalar(name=self.policy_name + "/logp_max", data=logp_max)
        tf.summary.scalar(name=self.policy_name + "/logp_mean", data=logp_mean)
        if self.auto_alpha:
            tf.summary.scalar(name=self.policy_name + "/log_ent", data=self.log_alpha)
            tf.summary.scalar(name=self.policy_name + "/logp_mean+target",
                              data=logp_mean + self.target_alpha)
        tf.summary.scalar(name=self.policy_name + "/ent", data=self.alpha)
        tf.summary.scalar(name=self.policy_name + "/alpha_loss", data=alpha_loss)

    # Update encoder/decoder
    rec_loss, latent_loss = self._update_encoder(states)
    tf.summary.scalar(name=self.policy_name + "/rec_loss", data=rec_loss)
    tf.summary.scalar(name=self.policy_name + "/latent_loss", data=latent_loss)

    self._n_update += 1

    return qf_loss
def __init__(self, state_shape, action_dim, name="TD3", actor_update_freq=2, policy_noise=0.2, noise_clip=0.5, critic_units=(400, 300), **kwargs): """ Initialize TD3 Args: shate_shape (iterable of ints): Observation state shape action_dim (int): Action dimension name (str): Network name. The default is ``"TD3"``. actor_update_freq (int): Number of critic updates per one actor upate. policy_noise (float): noise_clip (float): critic_units (iterable of int): Numbers of units at hidden layer of critic. The default is ``(400, 300)`` max_action (float): Size of maximum action. (``-max_action`` <= action <= ``max_action``). The degault is ``1``. lr_actor (float): Learning rate for actor network. The default is ``0.001``. lr_critic (float): Learning rage for critic network. The default is ``0.001``. actor_units (iterable of int): Number of units at hidden layers of actor. sigma (float): Standard deviation of Gaussian noise. The default is ``0.1``. tau (float): Weight update ratio for target network. ``target = (1-tau)*target + tau*network`` The default is ``0.005``. n_warmup (int): Number of warmup steps before training. The default is ``1e4``. memory_capacity (int): Replay Buffer size. The default is ``1e4``. batch_size (int): Batch size. The default is ``256``. discount (float): Discount factor. The default is ``0.99``. max_grad (float): Maximum gradient. The default is ``10``. gpu (int): GPU id. ``-1`` disables GPU. The default is ``0``. """ super().__init__(name=name, state_shape=state_shape, action_dim=action_dim, **kwargs) self.critic = Critic(state_shape, action_dim, critic_units) self.critic_target = Critic(state_shape, action_dim, critic_units) update_target_variables(self.critic_target.weights, self.critic.weights, tau=1.) self._policy_noise = policy_noise self._noise_clip = noise_clip self._actor_update_freq = actor_update_freq self._it = tf.Variable(0, dtype=tf.int32)
def train(self, states, actions, next_states, rewards, done, weights=None):
    if weights is None:
        weights = np.ones_like(rewards)
    td_error, q_func_loss = self._train_body(
        states, actions, next_states, rewards, done, weights)

    tf.summary.scalar(name="loss/QFuncLoss", data=q_func_loss)

    # Remove following by using tf.global_step
    self.n_update += 1
    # Update target networks
    if self.n_update % self.target_replace_interval == 0:
        update_target_variables(
            self.q_func_target.weights, self.q_func.weights, tau=1.)

    return td_error
def __init__(self, env, params, **kwargs):
    """Initializes a DDPG agent."""
    super().__init__(name=params["agent"]["name"],
                     memory_capacity=params["agent"]["memory_capacity"],
                     n_warmup=params["agent"]["n_warmup"],
                     gpu=params["agent"]["gpu"],
                     batch_size=params["agent"]["batch_size"],
                     update_interval=params["agent"]["update_interval"],
                     **kwargs)

    # Define and initialize Actor network
    self.actor = Actor(state_shape=env.observation_space.shape,
                       action_space=env.action_space,
                       params=params)
    self.actor_target = Actor(state_shape=env.observation_space.shape,
                              action_space=env.action_space,
                              params=params)
    self.actor_optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["agent"]["lr_actor"])
    update_target_variables(self.actor_target.weights, self.actor.weights, tau=1.)

    # Define and initialize Critic network
    self.critic = Critic(state_shape=env.observation_space.shape,
                         action_dim=env.action_space.high.size,
                         params=params)
    self.critic_target = Critic(state_shape=env.observation_space.shape,
                                action_dim=env.action_space.high.size,
                                params=params)
    self.critic_optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["agent"]["lr_critic"])
    update_target_variables(self.critic_target.weights, self.critic.weights, tau=1.)

    # Set hyperparameters
    self.sigma = params["agent"]["sigma"]
    self.tau = params["agent"]["tau"]

    # In evaluation mode the action of the agent is deterministic, not stochastic
    self.eval_mode = False
def _train_body(self, states, actions, next_states, rewards, done, weights): with tf.device(self.device): with tf.GradientTape() as tape: td_error1, td_error2 = self._compute_td_error_body( states, actions, next_states, rewards, done) critic_loss = ( tf.reduce_mean( huber_loss(td_error1, delta=self.max_grad) * weights) + tf.reduce_mean( huber_loss(td_error2, delta=self.max_grad) * weights)) critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables) self.critic_optimizer.apply_gradients( zip(critic_grad, self.critic.trainable_variables)) self._it.assign_add(1) with tf.GradientTape() as tape: next_actions = self.actor(states) actor_loss = -tf.reduce_mean(self.critic(states, next_actions)) remainder = tf.math.mod(self._it, self._actor_update_freq) def optimize_actor(): actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables) return self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables)) tf.cond(pred=tf.equal(remainder, 0), true_fn=optimize_actor, false_fn=tf.no_op) # Update target networks update_target_variables(self.critic_target.weights, self.critic.weights, self.tau) update_target_variables(self.actor_target.weights, self.actor.weights, self.tau) return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs( td_error2)
def __init__(self, state_shape, action_dim, name="DDPG", max_action=1., lr_actor=0.001, lr_critic=0.001, actor_units=(400, 300), critic_units=(400, 300), sigma=0.1, tau=0.005, n_warmup=int(1e4), memory_capacity=int(1e6), **kwargs): super().__init__(name=name, memory_capacity=memory_capacity, n_warmup=n_warmup, **kwargs) # Define and initialize Actor network self.actor = Actor(state_shape, action_dim, max_action, actor_units) self.actor_target = Actor(state_shape, action_dim, max_action, actor_units) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr_actor) update_target_variables(self.actor_target.weights, self.actor.weights, tau=1.) # Define and initialize Critic network self.critic = Critic(state_shape, action_dim, critic_units) self.critic_target = Critic(state_shape, action_dim, critic_units) self.critic_optimizer = tf.keras.optimizers.Adam( learning_rate=lr_critic) update_target_variables(self.critic_target.weights, self.critic.weights, tau=1.) # Set hyperparameters self.sigma = sigma self.tau = tau
def set_weights_fn(policy, weights):
    actor_weights, critic_weights, critic_target_weights = weights
    update_target_variables(policy.actor.weights, actor_weights, tau=1.)
    update_target_variables(policy.critic.weights, critic_weights, tau=1.)
    update_target_variables(policy.critic_target.weights, critic_target_weights, tau=1.)
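# NOTE (editor's sketch): `set_weights_fn` above unpacks three weight lists, so
# a matching getter (assumed; e.g. for shipping weights from a learner to
# explorer processes in a distributed setup) would pack them in the same order:
def get_weights_fn_sketch(policy):
    return [policy.actor.weights,
            policy.critic.weights,
            policy.critic_target.weights]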
def __init__(self, state_shape, action_dim, q_func=None, name="DQN", lr=0.001, units=[32, 32], epsilon=0.1, n_warmup=int(1e4), target_replace_interval=int(5e3), memory_capacity=int(1e6), enable_double_dqn=False, enable_dueling_dqn=False, **kwargs): super().__init__(name=name, memory_capacity=memory_capacity, n_warmup=n_warmup, **kwargs) q_func = q_func if q_func is not None else QFunc # Define and initialize Q-function network self.q_func = q_func(state_shape, action_dim, units) self.q_func_target = q_func(state_shape, action_dim, units) self.q_func_optimizer = tf.train.AdamOptimizer(learning_rate=lr) update_target_variables(self.q_func_target.weights, self.q_func.weights, tau=1.) self._action_dim = action_dim # Set hyperparameters self.epsilon = epsilon self.target_replace_interval = target_replace_interval self.n_update = 0 # DQN variants self._enable_double_dqn = enable_double_dqn self._enable_dueling_dqn = enable_dueling_dqn
def _train_body(self, states, actions, next_states, rewards, done, weights):
    with tf.device(self.device):
        with tf.GradientTape() as tape:
            td_error1, td_error2 = self._compute_td_error_body(
                states, actions, next_states, rewards, done)
            critic_loss = tf.reduce_mean(
                tf.square(td_error1) * weights * 0.5 +
                tf.square(td_error2) * weights * 0.5)

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        actor_loss = None
        # TODO: Update actor and target networks at specified frequency
        # tf.assign(self._it, self._it+1)
        # if tf.mod(self._it, self._actor_update_freq) == 0:
        with tf.GradientTape() as tape:
            next_actions = self.actor(states)
            actor_loss = -tf.reduce_mean(self.critic([states, next_actions]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        # Update target networks
        update_target_variables(self.critic_target.weights, self.critic.weights, self.tau)
        update_target_variables(self.actor_target.weights, self.actor.weights, self.tau)

        return actor_loss, critic_loss, tf.abs(td_error1) + tf.abs(td_error2)
def _train_body(self, states, actions, next_states, rewards, done, weights=None):
    with tf.device(self.device):
        batch_size = states.shape[0]
        not_dones = 1. - tf.cast(done, dtype=tf.float32)
        actions = tf.cast(actions, dtype=tf.int32)
        indices = tf.concat(
            values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
            axis=1)

        with tf.GradientTape(persistent=True) as tape:
            # Compute critic loss
            _, _, next_action_param = self.actor(next_states)
            next_action_prob = next_action_param["prob"]
            next_action_logp = tf.math.log(next_action_prob + 1e-8)
            next_q = tf.minimum(
                self.qf1_target(next_states), self.qf2_target(next_states))

            target_q = tf.expand_dims(tf.einsum(
                'ij,ij->i', next_action_prob,
                next_q - self.alpha * next_action_logp), axis=1)  # Eq.(10)
            target_q = tf.stop_gradient(
                rewards + not_dones * self.discount * target_q)

            current_q1 = self.qf1(states)
            current_q2 = self.qf2(states)

            td_loss1 = tf.reduce_mean(huber_loss(
                target_q - tf.expand_dims(tf.gather_nd(current_q1, indices), axis=1),
                delta=self.max_grad))
            td_loss2 = tf.reduce_mean(huber_loss(
                target_q - tf.expand_dims(tf.gather_nd(current_q2, indices), axis=1),
                delta=self.max_grad))  # Eq.(7)

            # Compute actor loss
            _, _, current_action_param = self.actor(states)
            current_action_prob = current_action_param["prob"]
            current_action_logp = tf.math.log(current_action_prob + 1e-8)

            policy_loss = tf.reduce_mean(tf.einsum(
                'ij,ij->i', current_action_prob,
                self.alpha * current_action_logp - tf.stop_gradient(
                    tf.minimum(current_q1, current_q2))))  # Eq.(12)
            mean_ent = tf.reduce_mean(tf.einsum(
                'ij,ij->i', current_action_prob, current_action_logp)) * (-1)

        q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
        self.qf1_optimizer.apply_gradients(
            zip(q1_grad, self.qf1.trainable_variables))
        q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
        self.qf2_optimizer.apply_gradients(
            zip(q2_grad, self.qf2.trainable_variables))
        update_target_variables(self.qf1_target.weights, self.qf1.weights, tau=self.tau)
        update_target_variables(self.qf2_target.weights, self.qf2.weights, tau=self.tau)

        actor_grad = tape.gradient(policy_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        return (td_loss1 + td_loss2) / 2., policy_loss, mean_ent, \
            tf.reduce_min(current_action_logp), tf.reduce_max(current_action_logp)
def __init__(self, state_shape, action_dim, q_func=None, name="DQN", lr=0.001, units=[32, 32], epsilon=0.1, epsilon_min=None, epsilon_decay_step=int(1e6), n_warmup=int(1e4), target_replace_interval=int(5e3), memory_capacity=int(1e6), optimizer=None, enable_double_dqn=False, enable_dueling_dqn=False, enable_noisy_dqn=False, enable_categorical_dqn=False, **kwargs): super().__init__(name=name, memory_capacity=memory_capacity, n_warmup=n_warmup, **kwargs) q_func = q_func if q_func is not None else QFunc # Define and initialize Q-function network kwargs_dqn = { "state_shape": state_shape, "action_dim": action_dim, "units": units, "enable_dueling_dqn": enable_dueling_dqn, "enable_noisy_dqn": enable_noisy_dqn, "enable_categorical_dqn": enable_categorical_dqn } self.q_func = q_func(**kwargs_dqn) self.q_func_target = q_func(**kwargs_dqn) self.q_func_optimizer = optimizer if optimizer is not None else \ tf.keras.optimizers.Adam(learning_rate=lr) update_target_variables(self.q_func_target.weights, self.q_func.weights, tau=1.) self._action_dim = action_dim # This is used to check if input state to `get_action` is multiple (batch) or single self._state_ndim = np.array(state_shape).shape[0] # Distributional DQN if enable_categorical_dqn: self._v_max, self._v_min = 10., -10. self._delta_z = (self._v_max - self._v_min) / \ (self.q_func._n_atoms - 1) self._z_list = tf.constant([ self._v_min + i * self._delta_z for i in range(self.q_func._n_atoms) ], dtype=tf.float32) self._z_list_broadcasted = tf.tile( tf.reshape(self._z_list, [1, self.q_func._n_atoms]), tf.constant([self._action_dim, 1])) # Set hyper-parameters if epsilon_min is not None and not enable_noisy_dqn: assert epsilon > epsilon_min self.epsilon_min = epsilon_min self.epsilon_decay_rate = (epsilon - epsilon_min) / epsilon_decay_step self.epsilon = max( epsilon - self.epsilon_decay_rate * self.n_warmup, self.epsilon_min) else: epsilon = epsilon if not enable_noisy_dqn else 0. self.epsilon = epsilon self.epsilon_min = epsilon self.epsilon_decay_rate = 0. self.target_replace_interval = target_replace_interval self.n_update = 0 # DQN variants self._enable_double_dqn = enable_double_dqn self._enable_noisy_dqn = enable_noisy_dqn self._enable_categorical_dqn = enable_categorical_dqn
def _train_body(self, states, actions, next_states, rewards, dones, weights):
    with tf.device(self.device):
        if tf.rank(rewards) == 2:
            rewards = tf.squeeze(rewards, axis=1)
        not_dones = 1. - tf.cast(dones, dtype=tf.float32)

        with tf.GradientTape(persistent=True) as tape:
            # Compute loss of critic Q
            current_q1 = self.qf1([states, actions])
            current_q2 = self.qf2([states, actions])
            vf_next_target = self.vf_target(next_states)

            target_q = tf.stop_gradient(
                rewards + not_dones * self.discount * vf_next_target)

            td_loss_q1 = tf.reduce_mean(huber_loss(
                target_q - current_q1, delta=self.max_grad) * weights)
            td_loss_q2 = tf.reduce_mean(huber_loss(
                target_q - current_q2, delta=self.max_grad) * weights)  # Eq.(7)

            # Compute loss of critic V
            current_v = self.vf(states)

            sample_actions, logp, _ = self.actor(states)  # Resample actions to update V
            current_q1 = self.qf1([states, sample_actions])
            current_q2 = self.qf2([states, sample_actions])
            current_min_q = tf.minimum(current_q1, current_q2)

            target_v = tf.stop_gradient(current_min_q - self.alpha * logp)
            td_errors = target_v - current_v
            td_loss_v = tf.reduce_mean(
                huber_loss(td_errors, delta=self.max_grad) * weights)  # Eq.(5)

            # Compute loss of policy
            policy_loss = tf.reduce_mean(
                (self.alpha * logp - current_min_q) * weights)  # Eq.(12)

            # Compute loss of temperature parameter for entropy
            if self.auto_alpha:
                alpha_loss = -tf.reduce_mean(
                    (self.log_alpha * tf.stop_gradient(logp + self.target_alpha)))

        q1_grad = tape.gradient(td_loss_q1, self.qf1.trainable_variables)
        self.qf1_optimizer.apply_gradients(
            zip(q1_grad, self.qf1.trainable_variables))
        q2_grad = tape.gradient(td_loss_q2, self.qf2.trainable_variables)
        self.qf2_optimizer.apply_gradients(
            zip(q2_grad, self.qf2.trainable_variables))

        vf_grad = tape.gradient(td_loss_v, self.vf.trainable_variables)
        self.vf_optimizer.apply_gradients(
            zip(vf_grad, self.vf.trainable_variables))
        update_target_variables(self.vf_target.weights, self.vf.weights, self.tau)

        actor_grad = tape.gradient(policy_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        if self.auto_alpha:
            alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
            self.alpha_optimizer.apply_gradients(
                zip(alpha_grad, [self.log_alpha]))
            self.alpha.assign(tf.exp(self.log_alpha))

        del tape

    return td_errors, policy_loss, td_loss_v, td_loss_q1, \
        tf.reduce_min(logp), tf.reduce_max(logp), tf.reduce_mean(logp)
def _train_body(self, states, actions, next_states, rewards, done, weights=None):
    with tf.device(self.device):
        rewards = tf.squeeze(rewards, axis=1)
        not_done = 1. - tf.cast(done, dtype=tf.float32)

        # Update Critic
        with tf.GradientTape(persistent=True) as tape:
            current_Q1 = self.qf1([states, actions])
            current_Q2 = self.qf2([states, actions])
            vf_next_target = self.vf_target(next_states)

            target_Q = tf.stop_gradient(
                self.scale_reward * rewards +
                not_done * self.discount * vf_next_target)

            td_loss1 = tf.reduce_mean(
                huber_loss(target_Q - current_Q1, delta=self.max_grad))
            td_loss2 = tf.reduce_mean(
                huber_loss(target_Q - current_Q2, delta=self.max_grad))

        q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
        self.qf1_optimizer.apply_gradients(
            zip(q1_grad, self.qf1.trainable_variables))
        q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
        self.qf2_optimizer.apply_gradients(
            zip(q2_grad, self.qf2.trainable_variables))
        del tape

        with tf.GradientTape(persistent=True) as tape:
            current_V = self.vf(states)

            sample_actions, logp = self.actor(states)
            current_Q1 = self.qf1([states, sample_actions])
            current_Q2 = self.qf2([states, sample_actions])
            current_Q = tf.minimum(current_Q1, current_Q2)

            target_V = tf.stop_gradient(current_Q - logp)
            td_errors = target_V - current_V
            vf_loss_t = tf.reduce_mean(
                huber_loss(td_errors, delta=self.max_grad) * weights)

            # TODO: Add regularizer
            policy_loss = tf.reduce_mean(logp - current_Q1)

        vf_grad = tape.gradient(vf_loss_t, self.vf.trainable_variables)
        self.vf_optimizer.apply_gradients(
            zip(vf_grad, self.vf.trainable_variables))
        update_target_variables(self.vf_target.weights, self.vf.weights, self.tau)

        actor_grad = tape.gradient(policy_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        del tape

    return td_errors, policy_loss, vf_loss_t, td_loss1, \
        tf.reduce_min(logp), tf.reduce_max(logp)
def __init__(self,
             state_shape,
             action_dim,
             q_func=None,
             name="DQN",
             lr=0.001,
             adam_eps=1e-07,
             units=(32, 32),
             epsilon=0.1,
             epsilon_min=None,
             epsilon_decay_step=int(1e6),
             n_warmup=int(1e4),
             target_replace_interval=int(5e3),
             memory_capacity=int(1e6),
             enable_double_dqn=False,
             enable_dueling_dqn=False,
             enable_noisy_dqn=False,
             optimizer=None,
             **kwargs):
    """
    Initialize DQN agent

    Args:
        state_shape (iterable of int): Observation space shape
        action_dim (int): Dimension of discrete action
        q_func (QFunc): Custom Q function class. If ``None`` (default),
            Q function is constructed with ``QFunc``.
        name (str): Name of agent. The default is ``"DQN"``
        lr (float): Learning rate. The default is ``0.001``.
        adam_eps (float): Epsilon for Adam. The default is ``1e-7``
        units (iterable of int): Units of hidden layers. The default is ``(32, 32)``
        epsilon (float): Initial epsilon of e-greedy. The default is ``0.1``
        epsilon_min (float): Minimum epsilon after decay.
        epsilon_decay_step (int): Number of steps decaying. The default is ``1e6``
        n_warmup (int): Number of warmup steps before training. The default is ``1e4``
        target_replace_interval (int): Number of steps between target network update.
            The default is ``5e3``
        memory_capacity (int): Size of replay buffer. The default is ``1e6``
        enable_double_dqn (bool): Whether to use Double DQN. The default is ``False``
        enable_dueling_dqn (bool): Whether to use Dueling network. The default is ``False``
        enable_noisy_dqn (bool): Whether to use noisy network. The default is ``False``
        optimizer (tf.keras.optimizers.Optimizer): Custom optimizer
        batch_size (int): Batch size. The default is ``256``.
        discount (float): Discount factor. The default is ``0.99``.
        max_grad (float): Maximum gradient. The default is ``10``.
        gpu (int): GPU id. ``-1`` disables GPU. The default is ``0``.
    """
    super().__init__(name=name, memory_capacity=memory_capacity,
                     n_warmup=n_warmup, **kwargs)

    q_func = q_func if q_func is not None else QFunc
    # Define and initialize Q-function network
    kwargs_dqn = {
        "state_shape": state_shape,
        "action_dim": action_dim,
        "units": units,
        "enable_dueling_dqn": enable_dueling_dqn,
        "enable_noisy_dqn": enable_noisy_dqn}
    self.q_func = q_func(**kwargs_dqn)
    self.q_func_target = q_func(**kwargs_dqn)
    self.q_func_optimizer = optimizer or tf.keras.optimizers.Adam(
        learning_rate=lr, epsilon=adam_eps)
    update_target_variables(self.q_func_target.weights,
                            self.q_func.weights, tau=1.)

    self._action_dim = action_dim
    # This is used to check if input state to `get_action` is multiple (batch) or single
    self._state_ndim = np.array(state_shape).shape[0]

    # Set hyper-parameters
    if epsilon_min is not None and not enable_noisy_dqn:
        assert epsilon > epsilon_min
        self.epsilon_min = epsilon_min
        self.epsilon_decay_rate = (epsilon - epsilon_min) / epsilon_decay_step
        self.epsilon = max(epsilon - self.epsilon_decay_rate * self.n_warmup,
                           self.epsilon_min)
    else:
        epsilon = epsilon if not enable_noisy_dqn else 0.
        self.epsilon = epsilon
        self.epsilon_min = epsilon
        self.epsilon_decay_rate = 0.
    self.target_replace_interval = target_replace_interval
    self.n_update = 0

    # DQN variants
    self._enable_double_dqn = enable_double_dqn
    self._enable_noisy_dqn = enable_noisy_dqn
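# NOTE (editor's sketch): the Double-DQN branch toggled by `enable_double_dqn`
# lives in a `_compute_td_error_body` helper not included in this excerpt. The
# standard form selects the greedy action with the online network and evaluates
# it with the target network; this is an assumed implementation, with `rewards`
# and `not_dones` taken to be rank-1 tensors.
def _double_dqn_target_sketch(self, next_states, rewards, not_dones):
    if self._enable_double_dqn:
        greedy_actions = tf.argmax(self.q_func(next_states), axis=1)
        next_q = tf.gather(self.q_func_target(next_states),
                           greedy_actions, axis=1, batch_dims=1)
    else:
        next_q = tf.reduce_max(self.q_func_target(next_states), axis=1)
    return tf.stop_gradient(rewards + not_dones * self.discount * next_q)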
def _train_body(self, states, actions, next_states, rewards, dones, weights):
    with tf.device(self.device):
        batch_size = states.shape[0]
        not_dones = 1. - tf.cast(dones, dtype=tf.float32)
        actions = tf.cast(actions, dtype=tf.int32)
        indices = tf.concat(
            values=[tf.expand_dims(tf.range(batch_size), axis=1), actions],
            axis=1)

        with tf.GradientTape(persistent=True) as tape:
            # Compute critic loss
            next_action_prob = self.actor(next_states)
            next_action_logp = tf.math.log(next_action_prob + 1e-8)
            next_q = tf.minimum(
                self.qf1_target(next_states), self.qf2_target(next_states))

            # Compute state value function V by directly computing the expectation
            target_q = tf.expand_dims(tf.einsum(
                'ij,ij->i', next_action_prob,
                next_q - self.alpha * next_action_logp), axis=1)  # Eq.(10)
            target_q = tf.stop_gradient(
                rewards + not_dones * self.discount * target_q)

            current_q1 = self.qf1(states)
            current_q2 = self.qf2(states)

            td_loss1 = tf.reduce_mean(huber_loss(
                target_q - tf.expand_dims(tf.gather_nd(current_q1, indices), axis=1),
                delta=self.max_grad) * weights)
            td_loss2 = tf.reduce_mean(huber_loss(
                target_q - tf.expand_dims(tf.gather_nd(current_q2, indices), axis=1),
                delta=self.max_grad) * weights)  # Eq.(7)

            # Compute actor loss
            current_action_prob = self.actor(states)
            current_action_logp = tf.math.log(current_action_prob + 1e-8)

            policy_loss = tf.reduce_mean(tf.einsum(
                'ij,ij->i', current_action_prob,
                self.alpha * current_action_logp - tf.stop_gradient(
                    tf.minimum(current_q1, current_q2))) * weights)  # Eq.(12)
            mean_ent = tf.reduce_mean(tf.einsum(
                'ij,ij->i', current_action_prob, current_action_logp)) * (-1)

            if self.auto_alpha:
                alpha_loss = -tf.reduce_mean(
                    (self.log_alpha * tf.stop_gradient(
                        current_action_logp + self.target_alpha)))

        q1_grad = tape.gradient(td_loss1, self.qf1.trainable_variables)
        self.qf1_optimizer.apply_gradients(
            zip(q1_grad, self.qf1.trainable_variables))
        q2_grad = tape.gradient(td_loss2, self.qf2.trainable_variables)
        self.qf2_optimizer.apply_gradients(
            zip(q2_grad, self.qf2.trainable_variables))

        if self.target_hard_update:
            if self.n_training % self.target_update_interval == 0:
                update_target_variables(self.qf1_target.weights, self.qf1.weights, tau=1.)
                update_target_variables(self.qf2_target.weights, self.qf2.weights, tau=1.)
        else:
            update_target_variables(self.qf1_target.weights, self.qf1.weights, tau=self.tau)
            update_target_variables(self.qf2_target.weights, self.qf2.weights, tau=self.tau)

        actor_grad = tape.gradient(policy_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        if self.auto_alpha:
            alpha_grad = tape.gradient(alpha_loss, [self.log_alpha])
            self.alpha_optimizer.apply_gradients(
                zip(alpha_grad, [self.log_alpha]))
            self.alpha.assign(tf.exp(self.log_alpha))

        return (td_loss1 + td_loss2) / 2., policy_loss, mean_ent, \
            tf.reduce_min(current_action_logp), tf.reduce_max(current_action_logp), \
            tf.reduce_mean(current_action_logp)