# Deep RL agent implementations in TensorFlow 2. The network modules (Squashed_Gaussian_Actor,
# Policy_network, Q_network, V_network, PixelEncoder, PixelDecoder, Transition_Network,
# Reward_Network), the replay Buffer, and the helpers copy_weight / soft_update are defined
# elsewhere in the repository.
import numpy as np
import tensorflow as tf


class SAC_v2:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.critic_update = args.critic_update

        self.log_alpha = tf.Variable(np.log(args.alpha), dtype=tf.float32, trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim,
                                             args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state)
        action = np.clip(action.numpy()[0], -1, 1)
        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state, deterministic=True)
        action = np.clip(action.numpy()[0], -1, 1)
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            ns_action, ns_logpi = self.actor(ns)
            target_min_aq = tf.minimum(self.target_critic1(ns, ns_action),
                                       self.target_critic2(ns, ns_action))
            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi))

            # Critic update: both Q networks regress onto the shared soft target.
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = 0.5 * tf.reduce_mean(tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(zip(critic2_gradients, self.critic2.trainable_variables))
            del tape1

            # Actor update: maximize the entropy-regularized minimum Q value.
            with tf.GradientTape() as tape2:
                s_action, s_logpi = self.actor(s)
                min_aq_rep = tf.minimum(self.critic1(s, s_action), self.critic2(s, s_action))
                actor_loss = 0.5 * tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
            del tape2

            # Temperature update: push the policy entropy toward the target entropy.
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(s)
                    alpha_loss = -(tf.exp(self.log_alpha) *
                                   tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)  # from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha]))
                del tape3

            if self.current_step % self.critic_update == 0:
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss], ['Loss/alpha', total_alpha_loss],
                ['Alpha', tf.exp(self.log_alpha).numpy()]]
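# The agents in this file call copy_weight and soft_update, which live elsewhere in the
# repository. The bodies below are only a minimal sketch of what those helpers are assumed
# to do (hard copy at initialization, Polyak averaging during training), not the
# repository's actual implementation.
def copy_weight(network, target_network):
    # Hard copy: target <- source.
    for v, target_v in zip(network.trainable_variables, target_network.trainable_variables):
        target_v.assign(v)


def soft_update(network, target_network, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target.
    for v, target_v in zip(network.trainable_variables, target_network.trainable_variables):
        target_v.assign(tau * v + (1 - tau) * target_v)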
class DBC_SACv2:
    def __init__(self, obs_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                             args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

        self.dynamics_model = Transition_Network(self.feature_dim, action_dim, deterministic=False)
        self.reward_model = Reward_Network(self.feature_dim)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
        self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
        self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

        self.current_step = 0

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                             'Encoder': self.encoder, 'Target_Encoder': self.target_encoder,
                             'Dynamics': self.dynamics_model, 'Reward': self.reward_model}
        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]
        return action

    def eval_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]
        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_encoder_loss = 0
        total_dynamics_loss = 0
        total_reward_loss = 0

        loss_list = []

        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))
        target_min_aq = tf.minimum(self.target_critic1(self.target_encoder(ns), ns_action),
                                   self.target_critic2(self.target_encoder(ns), ns_action))
        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi))

        # Critic update: gradients flow through the encoder and the Q networks.
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables))
        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables))
        del tape1

        # Actor (and temperature) update on detached encoder features.
        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:
                s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))
                min_aq_rep = tf.minimum(self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                                        self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
            del tape2

            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(tf.exp(self.log_alpha) *
                                   tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                    # alpha_loss = tf.reduce_mean(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(zip(log_alpha_gradients, [self.log_alpha]))
                del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        # train encoder: regress latent distances onto the bisimulation target
        # (reward distance + discounted dynamics distance) for a shuffled pairing of the batch.
        with tf.GradientTape() as tape4:
            new_ids = np.arange(len(s))
            np.random.shuffle(new_ids)
            s2 = tf.gather(s, new_ids)

            feature = self.encoder(s)
            # feature2 = tf.gather(feature, new_ids)
            feature2 = self.encoder(s2)

            reward = self.reward_model(tf.stop_gradient(feature))
            # reward2 = tf.gather(reward, new_ids)
            reward2 = self.reward_model(tf.stop_gradient(feature2))

            feature_action, _ = self.actor(tf.stop_gradient(feature), True)
            feature2_action, _ = self.actor(tf.stop_gradient(feature2), True)

            mu, sigma = self.dynamics_model(tf.stop_gradient(feature), feature_action)
            mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2), feature2_action)

            z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2), shape=[-1, 1])
            r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2), shape=[-1, 1])
            transition_dist = tf.sqrt(tf.square(mu - mu2) + tf.square(sigma - sigma2))

            bisimilarity = r_dist + self.gamma * transition_dist
            encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity))

        encoder_gradients = tape4.gradient(encoder_loss, self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(zip(encoder_gradients, self.encoder.trainable_variables))

        # train dynamics: Gaussian negative log-likelihood of the next latent feature.
        with tf.GradientTape() as tape5:
            feature = self.encoder(s)
            mu, sigma = self.dynamics_model(feature, a)

            if sigma[0][0].numpy() == 0:
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            next_feature = self.encoder(ns)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma
            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) + tf.math.log(sigma))

        dynamics_gradients = tape5.gradient(
            dynamics_loss, self.encoder.trainable_variables + self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(dynamics_gradients,
                self.encoder.trainable_variables + self.dynamics_model.trainable_variables))

        # train reward: predict the observed reward from a sampled next latent state.
        with tf.GradientTape() as tape6:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(feature, a)
            reward_prediction = self.reward_model(sample_dynamics)
            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape6.gradient(
            reward_loss, self.encoder.trainable_variables + self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(reward_gradients,
                self.encoder.trainable_variables + self.reward_model.trainable_variables))

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()
        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        total_encoder_loss += encoder_loss.numpy()
        loss_list.append(['Loss/Encoder', total_encoder_loss])

        total_dynamics_loss += dynamics_loss.numpy()
        loss_list.append(['Loss/Dynamics', total_dynamics_loss])

        total_reward_loss += reward_loss.numpy()
        loss_list.append(['Loss/Reward', total_reward_loss])

        if self.current_step % self.actor_update == 0 and self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
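# DBC_SACv2 calls Transition_Network and Reward_Network, which are defined elsewhere in the
# repository. The stubs below only sketch the interface those calls assume: a Gaussian latent
# dynamics model returning (mu, sigma) with a .sample() method and a .deterministic flag, and
# a scalar reward predictor. Layer sizes and activations here are illustrative placeholders,
# not the repository's actual models.
class Transition_Network(tf.keras.Model):
    def __init__(self, feature_dim, action_dim, deterministic=False):
        super().__init__()
        self.deterministic = deterministic
        self.hidden = tf.keras.layers.Dense(256, activation='relu')
        self.mu_head = tf.keras.layers.Dense(feature_dim)
        self.sigma_head = tf.keras.layers.Dense(feature_dim, activation='softplus')

    def call(self, feature, action):
        h = self.hidden(tf.concat([feature, action], axis=-1))
        mu = self.mu_head(h)
        # A deterministic model reports zero standard deviation (handled in the training loop).
        sigma = tf.zeros_like(mu) if self.deterministic else self.sigma_head(h)
        return mu, sigma

    def sample(self, feature, action):
        mu, sigma = self(feature, action)
        return mu + sigma * tf.random.normal(tf.shape(mu))


class Reward_Network(tf.keras.Model):
    def __init__(self, feature_dim):
        super().__init__()
        # feature_dim is kept for signature parity with the constructor call above.
        self.net = tf.keras.Sequential([tf.keras.layers.Dense(256, activation='relu'),
                                        tf.keras.layers.Dense(1)])

    def call(self, feature):
        return self.net(feature)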
class TD3:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        self.policy_delay = args.policy_delay
        self.actor_noise = args.actor_noise
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
        self.name = 'TD3'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        noise = np.random.normal(loc=0, scale=self.actor_noise, size=self.action_dim)
        action = self.actor(state).numpy()[0] + noise
        action = np.clip(action, -1, 1)
        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        action = np.clip(action, -1, 1)
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Target policy smoothing: clipped Gaussian noise on the target action.
            target_action = tf.clip_by_value(
                self.target_actor(ns) + tf.clip_by_value(
                    tf.random.normal(shape=self.target_actor(ns).shape,
                                     mean=0, stddev=self.target_noise),
                    -self.noise_clip, self.noise_clip),
                -1, 1)

            target_value = tf.stop_gradient(
                r + self.gamma * (1 - d) * tf.minimum(self.target_critic1(ns, target_action),
                                                      self.target_critic2(ns, target_action)))

            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(tf.square(target_value - self.critic1(s, a)))
                critic2_loss = 0.5 * tf.reduce_mean(tf.square(target_value - self.critic2(s, a)))

            critic1_grad = tape.gradient(critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(zip(critic1_grad, self.critic1.trainable_variables))
            critic2_grad = tape.gradient(critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(zip(critic2_grad, self.critic2.trainable_variables))
            del tape

            # Delayed policy and target updates; the actor loss is only defined (and logged)
            # on delayed steps.
            if self.current_step % self.policy_delay == 0:
                with tf.GradientTape() as tape2:
                    actor_loss = -tf.reduce_mean(self.critic1(s, self.actor(s)))

                actor_grad = tape2.gradient(actor_loss, self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)
                del tape2

                total_a_loss += actor_loss.numpy()

            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss]]
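# Every agent in this file samples transitions as `s, a, r, ns, d = self.buffer.sample(batch_size)`.
# The Buffer class itself is defined elsewhere in the repository; the version below is only a
# minimal sketch consistent with that call signature (float32 tensors, with reward and done kept
# as column vectors so they broadcast against (batch, 1) critic outputs).
class Buffer:
    def __init__(self, max_size):
        self.max_size = int(max_size)
        self.storage = []
        self.position = 0

    def add(self, s, a, r, ns, d):
        transition = (s, a, r, ns, float(d))
        if len(self.storage) < self.max_size:
            self.storage.append(transition)
        else:
            self.storage[self.position] = transition
        self.position = (self.position + 1) % self.max_size

    def sample(self, batch_size):
        idx = np.random.randint(0, len(self.storage), size=batch_size)
        s, a, r, ns, d = map(np.array, zip(*[self.storage[i] for i in idx]))

        def to_tensor(x):
            return tf.convert_to_tensor(x, dtype=tf.float32)

        return (to_tensor(s), to_tensor(a),
                to_tensor(r.reshape(-1, 1)), to_tensor(ns), to_tensor(d.reshape(-1, 1)))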
class DDQN:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)
        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.lr = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter

        self.network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)

        copy_weight(self.network, self.target_network)

        self.network_list = {'Network': self.network, 'Target_Network': self.target_network}
        self.name = 'Double DQN'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        q_value = self.network(state, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        # Epsilon-greedy exploration.
        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        else:
            return best_action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        q_value = self.network(state, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]
        return best_action

    def train(self, training_num):
        total_loss = 0
        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Double DQN target: the online network selects the next action,
            # the target network evaluates it.
            q_value = tf.expand_dims(
                tf.argmax(self.network(ns, activation='linear'), axis=1, output_type=tf.int32), axis=1)
            q_value_one = tf.squeeze(tf.one_hot(q_value, depth=self.action_dim), axis=1)

            target_value = r + self.gamma * (1 - d) * tf.reduce_sum(
                self.target_network(ns, activation='linear') * q_value_one, axis=1, keepdims=True)
            target_value = tf.stop_gradient(target_value)

            with tf.GradientTape() as tape:
                selected_values = tf.reduce_sum(
                    self.network(s, activation='linear') *
                    tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), self.action_dim), axis=1),
                    axis=1, keepdims=True)
                loss = 0.5 * tf.math.reduce_mean(tf.square(target_value - selected_values))

            variables = self.network.trainable_variables
            gradients = tape.gradient(loss, variables)
            self.optimizer.apply_gradients(zip(gradients, variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)

            total_loss += loss.numpy()

        return [['Loss/Loss', total_loss]]
class ImageDQN:
    def __init__(self, obs_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)
        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.feature_dim = args.feature_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.learning_rate = args.learning_rate
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        self.copy_iter = args.copy_iter
        self.layer_num = args.layer_num
        self.filter_num = args.filter_num

        self.network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num,
                                    self.filter_num, 'channels_last')
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num,
                                           self.filter_num, 'channels_last')

        copy_weight(self.network, self.target_network)
        copy_weight(self.encoder, self.target_encoder)

        self.network_list = {'Network': self.network, 'Target_Network': self.target_network}
        self.name = 'ImageDQN'

    def get_action(self, obs):
        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        else:
            obs = np.expand_dims(np.array(obs), axis=0)
            feature = self.encoder(obs)
            q_value = self.network(feature, activation='linear').numpy()
            best_action = np.argmax(q_value, axis=1)[0]
            return best_action

    def eval_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        q_value = self.network(feature, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]
        return best_action

    def train(self, training_num):
        total_loss = 0
        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_q = tf.reduce_max(self.target_network(self.target_encoder(ns), activation='linear'),
                                     axis=1, keepdims=True)
            target_value = r + self.gamma * (1 - d) * target_q
            target_value = tf.stop_gradient(target_value)

            with tf.GradientTape() as tape:
                selected_values = tf.reduce_sum(
                    self.network(self.encoder(s), activation='linear') *
                    tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), self.action_dim), axis=1),
                    axis=1, keepdims=True)
                loss = 0.5 * tf.reduce_mean(tf.square(target_value - selected_values))

            gradients = tape.gradient(
                loss, self.encoder.trainable_variables + self.network.trainable_variables)
            self.optimizer.apply_gradients(
                zip(gradients, self.encoder.trainable_variables + self.network.trainable_variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)
                copy_weight(self.encoder, self.target_encoder)

            total_loss += loss.numpy()
            del tape

        return [['Loss/Loss', total_loss]]
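# ImageDQN and the pixel-based SAC variants feed observations through a PixelEncoder (and
# SACv2_AE additionally through a PixelDecoder), both defined elsewhere in the repository.
# The sketch below only illustrates the constructor signature and call convention used above
# (obs -> feature_dim vector via layer_num conv layers with filter_num filters); the default
# data_format, normalization, and output activation are assumptions, not the actual module.
class PixelEncoder(tf.keras.Model):
    def __init__(self, obs_dim, feature_dim, layer_num, filter_num, data_format='channels_first'):
        super().__init__()
        # obs_dim is kept for signature parity with the constructor calls above.
        self.convs = [tf.keras.layers.Conv2D(filter_num, 3, strides=2 if i == 0 else 1,
                                             activation='relu', data_format=data_format)
                      for i in range(layer_num)]
        self.flatten = tf.keras.layers.Flatten()
        self.fc = tf.keras.layers.Dense(feature_dim)
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, obs):
        x = tf.cast(obs, tf.float32) / 255.0  # scale raw pixels
        for conv in self.convs:
            x = conv(x)
        return tf.tanh(self.norm(self.fc(self.flatten(x))))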
import tensorflow_addons as tfa


class SACv2_AE:
    def __init__(self, obs_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        self.current_step = 0

        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim
        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update
        self.decoder_update = args.decoder_update
        self.decoder_latent_lambda = args.decoder_latent_lambda
        self.decoder_weight_lambda = args.decoder_weight_lambda

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim,
                                             args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)
        self.decoder = PixelDecoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num)

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.decoder_optimizer = tfa.optimizers.AdamW(weight_decay=self.decoder_weight_lambda,
                                                      learning_rate=args.decoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2,
                             'Encoder': self.encoder, 'Target_Encoder': self.target_encoder,
                             'Decoder': self.decoder}
        self.name = 'SACv2_AE'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]
        return action

    def eval_action(self, obs):
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]
        return action

    def train(self, local_step):
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_ae_loss = 0

        loss_list = []

        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(self.encoder(ns))
        target_min_aq = tf.minimum(self.target_critic1(self.target_encoder(ns), ns_action),
                                   self.target_critic2(self.target_encoder(ns), ns_action))
        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi))

        # critic update
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables))
        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables))
        del tape1

        # actor update (encoder features are detached so the actor loss does not train the encoder)
        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:
                s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))
                min_aq_rep = tf.minimum(self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                                        self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
            del tape2

        # alpha update
        if self.train_alpha == True:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) *
                               tf.stop_gradient(s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(zip(log_alpha_gradients, [self.log_alpha]))
            del tape3

        # encoder, decoder update
        if self.current_step % self.decoder_update == 0:
            with tf.GradientTape(persistent=True) as tape4:
                feature = self.encoder(s)
                recovered_s = self.decoder(feature)
                real_s = preprocess_obs(s)

                rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s))
                latent_loss = tf.reduce_mean(0.5 * tf.reduce_sum(tf.square(feature), axis=1))
                ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss

            encoder_gradients = tape4.gradient(ae_loss, self.encoder.trainable_variables)
            decoder_gradients = tape4.gradient(ae_loss, self.decoder.trainable_variables)
            self.encoder_optimizer.apply_gradients(zip(encoder_gradients, self.encoder.trainable_variables))
            self.decoder_optimizer.apply_gradients(zip(decoder_gradients, self.decoder.trainable_variables))
            del tape4

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()
        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.decoder_update == 0:
            total_ae_loss += ae_loss.numpy()
            loss_list.append(['Loss/AutoEncoder', total_ae_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        if self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
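# The autoencoder update above regresses the decoder output onto preprocess_obs(s). That helper
# is defined elsewhere in the repository; the version below is a minimal sketch of the common
# SAC+AE convention (scale raw uint8 pixels and center them around zero), which may differ in
# detail from the repository's implementation.
def preprocess_obs(obs):
    obs = tf.cast(obs, tf.float32) / 255.0
    return obs - 0.5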
class SAC_v2:
    def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, alpha=0.1,
                 train_alpha=True, batch_size=128, buffer_size=1e6, tau=0.005,
                 learning_rate=0.0003, gamma=0.99, reward_scale=1, training_start=500):
        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.log_alpha = tf.Variable(np.log(alpha), dtype=tf.float32, trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))

        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2}
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        return action

    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)),
                                       self.target_critic2(ns, self.actor(ns)))
            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

            # critic training
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = tf.reduce_mean(tf.square(self.critic1(s, a) - target_q))
                critic2_loss = tf.reduce_mean(tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(zip(critic2_gradients, self.critic2.trainable_variables))
            del tape1

            # actor training
            with tf.GradientTape() as tape2:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=mu.shape) * sigma
                min_aq_rep = tf.minimum(self.critic1(s, output), self.critic2(s, output))
                actor_loss = tf.reduce_mean(self.alpha.numpy() * self.actor.log_pi(s) - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
            del tape2

            # alpha(temperature) training
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    alpha_loss = -(tf.exp(self.log_alpha) *
                                   tf.stop_gradient(self.actor.log_pi(s) + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)  # from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha]))
                del tape3

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
class SAC_v1:
    def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, batch_size=128,
                 buffer_size=1e6, tau=0.005, learning_rate=0.0003, gamma=0.99, alpha=0.2,
                 reward_scale=1, training_start=500):
        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim))
        self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))
        self.target_v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))

        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'V_network': self.v_network, 'Target_V_network': self.target_v_network}
        self.name = 'SAC_v1'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        return action

    def train(self, training_num):
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            min_aq = tf.minimum(self.critic1(s, self.actor(s)), self.critic2(s, self.actor(s)))
            target_v = tf.stop_gradient(min_aq - self.alpha * self.actor.log_pi(s))

            # v_network training
            with tf.GradientTape(persistent=True) as tape1:
                v_loss = 0.5 * tf.reduce_mean(tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss, self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(zip(v_gradients, self.v_network.trainable_variables))
            del tape1

            target_q = tf.stop_gradient(r + self.gamma * (1 - d) * self.target_v_network(ns))

            # critic training
            with tf.GradientTape(persistent=True) as tape2:
                critic1_loss = 0.5 * tf.reduce_mean(tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape2.gradient(critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(zip(critic2_gradients, self.critic2.trainable_variables))
            del tape2

            # actor training
            with tf.GradientTape() as tape3:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=sigma.shape) * sigma
                min_aq_rep = tf.minimum(self.critic1(s, output), self.critic2(s, output))
                actor_loss = tf.reduce_mean(self.alpha * self.actor.log_pi(s) - min_aq_rep)

            actor_grad = tape3.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))
            del tape3

            soft_update(self.v_network, self.target_v_network, self.tau)
class SAC_v1:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.alpha = args.alpha
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim,
                                             args.log_std_min, args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.v_network = V_network(self.state_dim, args.hidden_dim)
        self.target_v_network = V_network(self.state_dim, args.hidden_dim)

        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2,
                             'V_network': self.v_network, 'Target_V_network': self.target_v_network}
        self.name = 'SAC_v1'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state)
        action = np.clip(action.numpy()[0], -1, 1)
        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action, _ = self.actor(state, deterministic=True)
        action = np.clip(action.numpy()[0], -1, 1)
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_v_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            s_action, s_logpi = self.actor(s)
            min_aq = tf.minimum(self.critic1(s, s_action), self.critic2(s, s_action))
            target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

            with tf.GradientTape() as tape1:
                v_loss = 0.5 * tf.reduce_mean(tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss, self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(zip(v_gradients, self.v_network.trainable_variables))

            target_q = tf.stop_gradient(r + self.gamma * (1 - d) * self.target_v_network(ns))

            with tf.GradientTape(persistent=True) as tape2:
                critic1_loss = 0.5 * tf.reduce_mean(tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape2.gradient(critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(zip(critic2_gradients, self.critic2.trainable_variables))

            with tf.GradientTape() as tape3:
                s_action, s_logpi = self.actor(s)
                min_aq_rep = tf.minimum(self.critic1(s, s_action), self.critic2(s, s_action))
                actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

            actor_grad = tape3.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

            soft_update(self.v_network, self.target_v_network, self.tau)

            del tape1, tape2, tape3

            total_a_loss += actor_loss.numpy()
            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()
            total_v_loss += v_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss], ['Loss/V', total_v_loss]]
class DDPG:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.noise_scale = args.noise_scale
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim)
        self.target_critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim)

        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic, self.target_critic)

        self.network_list = {'Actor': self.actor, 'Target_Actor': self.target_actor,
                             'Critic': self.critic, 'Target_Critic': self.target_critic}
        self.name = 'DDPG'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        noise = np.random.normal(loc=0, scale=self.noise_scale, size=self.action_dim)
        action = self.actor(state).numpy()[0] + noise
        action = np.clip(action, -1, 1)
        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        action = np.clip(action, -1, 1)
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c_loss = 0

        for i in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            value_next = tf.stop_gradient(self.target_critic(ns, self.target_actor(ns)))
            target_value = r + (1 - d) * self.gamma * value_next

            with tf.GradientTape(persistent=True) as tape:
                critic_loss = 0.5 * tf.reduce_mean(tf.square(target_value - self.critic(s, a)))
                actor_loss = -tf.reduce_mean(self.critic(s, self.actor(s)))

            critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

            actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic, self.target_critic, self.tau)

            del tape

            total_a_loss += actor_loss.numpy()
            total_c_loss += critic_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic', total_c_loss]]
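# A minimal interaction loop showing how these agents are intended to be driven: act, store
# the transition, and call train() once enough samples have been collected. The Gymnasium
# environment name, the 5-tuple step API, and the buffer.add() signature are illustrative
# assumptions, not part of this file; action rescaling to the env's range is also omitted.
import gymnasium as gym


def run(agent, env_name='Pendulum-v1', max_steps=100000):
    env = gym.make(env_name)
    state, _ = env.reset()

    for step in range(max_steps):
        if step < agent.training_start:
            action = env.action_space.sample()  # warm-up with random actions
        else:
            action = agent.get_action(state)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.buffer.add(state, action, reward, next_state, done)
        state = next_state if not done else env.reset()[0]

        if step >= agent.training_start:
            agent.train(agent.training_step)  # returns a list of [name, value] log entries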