def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, alpha=0.1, train_alpha=True, batch_size=128, buffer_size=1e6, tau=0.005, learning_rate=0.0003, gamma=0.99, reward_scale=1, training_start=500): self.buffer = Buffer(buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = batch_size self.tau = tau self.gamma = gamma self.reward_scale = reward_scale self.training_start = training_start self.training_step = training_step self.log_alpha = tf.Variable(np.log(alpha), dtype=tf.float32, trainable=True) self.target_entropy = -action_dim self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate) self.train_alpha = train_alpha self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.target_critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.target_critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'SAC_v2'
def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.optimizer = tf.keras.optimizers.Adam(args.learning_rate) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.gamma = args.gamma self.lr = args.learning_rate self.epsilon = args.epsilon self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.copy_iter = args.copy_iter self.network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.network, self.target_network) self.network_list = { 'Network': self.network, 'Target_Network': self.target_network } self.name = 'Double DQN'
def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, batch_size=128, buffer_size=1e6, tau=0.005, learning_rate=0.0003, gamma=0.99, alpha=0.2, reward_scale=1, training_start=500): self.buffer = Buffer(buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = batch_size self.tau = tau self.gamma = gamma self.alpha = alpha self.reward_scale = reward_scale self.training_start = training_start self.training_step = training_step self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim)) self.target_v_network = V_network(self.state_dim, (hidden_dim, hidden_dim)) copy_weight(self.v_network, self.target_v_network) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'V_network': self.v_network, 'Target_V_network': self.target_v_network } self.name = 'SAC_v1'
def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.gamma = args.gamma self.tau = args.tau self.actor_lr = args.actor_lr self.critic_lr = args.critic_lr self.policy_delay = args.policy_delay self.actor_noise = args.actor_noise self.target_noise = args.target_noise self.noise_clip = args.noise_clip self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.actor, self.target_actor) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'TD3'
def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.tau = args.tau self.gamma = args.gamma self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.critic_update = args.critic_update self.log_alpha = tf.Variable(np.log(args.alpha), dtype=tf.float32, trainable=True) self.target_entropy = -action_dim self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr) self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'SAC_v2'
def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.optimizer = tf.keras.optimizers.Adam(args.learning_rate) self.obs_dim = obs_dim self.action_dim = action_dim self.feature_dim = args.feature_dim self.batch_size = args.batch_size self.gamma = args.gamma self.learning_rate = args.learning_rate self.epsilon = args.epsilon self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.copy_iter = args.copy_iter self.layer_num = args.layer_num self.filter_num = args.filter_num self.network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num, 'channels_last') self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num, 'channels_last') copy_weight(self.network, self.target_network) copy_weight(self.encoder, self.target_encoder) self.network_list = { 'Network': self.network, 'Target_Network': self.target_network } self.name = 'ImageDQN'
def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.tau = args.tau self.gamma = args.gamma self.alpha = args.alpha self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.v_network = V_network(self.state_dim, args.hidden_dim) self.target_v_network = V_network(self.state_dim, args.hidden_dim) copy_weight(self.v_network, self.target_v_network) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'V_network': self.v_network, 'Target_V_network': self.target_v_network } self.name = 'SAC_v1'
def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.gamma = args.gamma self.tau = args.tau self.noise_scale = args.noise_scale self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic = Q_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.actor, self.target_actor) copy_weight(self.critic, self.target_critic) self.network_list = { 'Actor': self.actor, 'Target_Actor': self.target_actor, 'Critic': self.critic, 'Target_Critic': self.target_critic } self.name = 'DDPG'
def __init__(self, state_dim, action_dim, args): self.discrete = args.discrete self.buffer = Buffer(args.buffer_size) self.gamma = args.gamma self.lambda_gae = args.lambda_gae self.batch_size = args.batch_size self.backtrack_iter = args.backtrack_iter self.backtrack_coeff = args.backtrack_coeff self.delta = args.delta self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.training_start = 0 self.training_step = args.training_step if self.discrete == True: self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.backup_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) else: self.actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim) self.backup_actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim) self.critic = V_network(self.state_dim) self.network_list = {'Actor': self.actor, 'Critic': self.critic} self.name = 'TRPO'
def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = obs_dim[-1] self.current_step = 0 self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.curl_latent_dim = args.curl_latent_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.policy_delay = args.policy_delay self.actor_noise = args.actor_noise self.target_noise = args.target_noise self.noise_clip = args.noise_clip self.training_start = args.training_start self.training_step = args.training_step self.actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.actor, self.target_actor) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.curl = CURL(self.feature_dim, self.curl_latent_dim) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder } self.name = 'CURL_TD3'
class CURL_TD3: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = obs_dim[-1] self.current_step = 0 self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.curl_latent_dim = args.curl_latent_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.policy_delay = args.policy_delay self.actor_noise = args.actor_noise self.target_noise = args.target_noise self.noise_clip = args.noise_clip self.training_start = args.training_start self.training_step = args.training_step self.actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_actor = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.actor, self.target_actor) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.curl = CURL(self.feature_dim, self.curl_latent_dim) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder } self.name = 'CURL_TD3' def get_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) noise = np.random.normal(loc=0, scale=self.actor_noise, size=self.action_dim) feature = self.encoder(obs) action = self.actor(feature).numpy()[0] + noise action = np.clip(action, -1, 1) return action def eval_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action = self.actor(feature).numpy()[0] action = np.clip(action, -1, 1) return action def train(self, local_step): self.current_step += 1 total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_cpc_loss = 0 loss_list = [] s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample( self.batch_size, self.image_size) obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"] with tf.GradientTape(persistent=True) as tape: z_a = self.encoder(obs_anchor) z_pos = tf.stop_gradient(self.target_encoder(obs_pos)) logits = self.curl.compute_logits(z_a, z_pos) labels = tf.range(logits.shape[0], dtype='int64') cpc_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits)) cpc_gradients = tape.gradient(cpc_loss, self.curl.trainable_variables) 
self.cpc_optimizer.apply_gradients( zip(cpc_gradients, self.curl.trainable_variables)) encoder_gradients = tape.gradient(cpc_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) total_cpc_loss += cpc_loss.numpy() loss_list.append(['Loss/CPC', total_cpc_loss]) del tape if self.current_step % 2 == 0: target_action = tf.clip_by_value( self.target_actor(self.target_encoder(ns)) + tf.clip_by_value( tf.random.normal(shape=self.target_actor( self.target_encoder(ns)).shape, mean=0, stddev=self.target_noise), -self.noise_clip, self.noise_clip), -1, 1) target_value = tf.stop_gradient(r + self.gamma * ( 1 - d ) * tf.minimum( self.target_critic1(self.target_encoder(ns), target_action), self.target_critic2(self.target_encoder(ns), target_action))) with tf.GradientTape(persistent=True) as tape: critic1_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic1(self.encoder(s), a))) critic2_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic2(self.encoder(s), a))) critic1_grad = tape.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_grad, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_grad = tape.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_grad, self.encoder.trainable_variables + self.critic2.trainable_variables)) if self.current_step % (2 * self.policy_delay) == 0: with tf.GradientTape() as tape2: actor_loss = -tf.reduce_mean( self.critic1( tf.stop_gradient(self.encoder(s)), self.actor(tf.stop_gradient(self.encoder(s))))) actor_grad = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables)) soft_update(self.actor, self.target_actor, self.tau) soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau) total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) return loss_list
def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = obs_dim[-1] self.current_step = 0 self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True) self.target_entropy = -action_dim self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.curl_latent_dim = args.curl_latent_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.critic_update = args.critic_update self.training_start = args.training_start self.training_step = args.training_step self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.curl = CURL(self.feature_dim, self.curl_latent_dim) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr) self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder } self.name = 'CURL_SACv2'
class CURL_SACv2: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = obs_dim[-1] self.current_step = 0 self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True) self.target_entropy = -action_dim self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.curl_latent_dim = args.curl_latent_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.critic_update = args.critic_update self.training_start = args.training_start self.training_step = args.training_step self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.curl = CURL(self.feature_dim, self.curl_latent_dim) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr) self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder } self.name = 'CURL_SACv2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature) action = action.numpy()[0] return action def eval_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature, deterministic=True) action = action.numpy()[0] return action def train(self, local_step): self.current_step += 1 total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_cpc_loss = 0 total_alpha_loss = 0 loss_list = [] s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample( self.batch_size, self.image_size) obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"] ns_action, ns_logpi = self.actor(self.encoder(ns)) target_min_aq = tf.minimum( self.target_critic1(self.target_encoder(ns), ns_action), self.target_critic2(self.target_encoder(ns), ns_action)) target_q = tf.stop_gradient( r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi)) with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean( 
tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = tf.reduce_mean( tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 with tf.GradientTape() as tape2: s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s))) min_aq_rep = tf.minimum( self.critic1(tf.stop_gradient(self.encoder(s)), s_action), self.critic2(tf.stop_gradient(self.encoder(s)), s_action)) actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape2 if self.train_alpha == True: with tf.GradientTape() as tape3: _, s_logpi = self.actor(self.encoder(s)) alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy)) alpha_loss = tf.nn.compute_average_loss(alpha_loss) #alpha_loss = tf.reduce_mean(alpha_loss) log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha]) self.log_alpha_optimizer.apply_gradients( zip(log_alpha_gradients, [self.log_alpha])) del tape3 if self.current_step % self.critic_update == 0: soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau) with tf.GradientTape(persistent=True) as tape4: z_a = self.encoder(obs_anchor) z_pos = tf.stop_gradient(self.target_encoder(obs_pos)) logits = self.curl.compute_logits(z_a, z_pos) labels = tf.range(logits.shape[0], dtype='int64') cpc_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits)) cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables) self.cpc_optimizer.apply_gradients( zip(cpc_gradients, self.curl.trainable_variables)) encoder_gradients = tape4.gradient(cpc_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) del tape4 total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) total_cpc_loss += cpc_loss.numpy() loss_list.append(['Loss/CPC', total_cpc_loss]) if self.train_alpha == True: total_alpha_loss += alpha_loss.numpy() loss_list.append(['Loss/Alpha', total_alpha_loss]) loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()]) return loss_list
def __init__(self, obs_dim, action_dim, hidden_dim=256, gamma=0.99, learning_rate=1e-5, batch_size=128, buffer_size=1e6, feature_dim=50, layer_num=4, filter_num=32, tau=0.005, encoder_tau=0.005, bisim_coef=0.5, training_start=1000, train_alpha=True, alpha=0.1): self.buffer = Buffer(buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), trainable=True) self.target_entropy = -action_dim self.hidden_dim = hidden_dim self.gamma = gamma self.learning_rate = learning_rate self.bisim_coef = bisim_coef self.batch_size = batch_size self.feature_dim = feature_dim self.layer_num = layer_num self.filter_num = filter_num self.tau = tau self.encoder_tau = encoder_tau self.training_start = training_start self.train_alpha = train_alpha self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.target_encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.dynamics_model = Transition_Network(feature_dim, action_dim, deterministic=False) self.reward_model = Reward_Network(feature_dim) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate) self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate) self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate) self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate) self.name = 'DBC_SACv2'
class SAC_v2: def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.tau = args.tau self.gamma = args.gamma self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.critic_update = args.critic_update self.log_alpha = tf.Variable(np.log(args.alpha), dtype=tf.float32, trainable=True) self.target_entropy = -action_dim self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr) self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'SAC_v2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, state): state = np.expand_dims(np.array(state), axis=0) action, _ = self.actor(state) action = np.clip(action.numpy()[0], -1, 1) return action def eval_action(self, state): state = np.expand_dims(np.array(state), axis=0) action, _ = self.actor(state, deterministic=True) action = np.clip(action.numpy()[0], -1, 1) return action def train(self, training_num): total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_alpha_loss = 0 for i in range(training_num): self.current_step += 1 s, a, r, ns, d = self.buffer.sample(self.batch_size) ns_action, ns_logpi = self.actor(ns) target_min_aq = tf.minimum(self.target_critic1(ns, ns_action), self.target_critic2(ns, ns_action)) target_q = tf.stop_gradient( r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi)) with tf.GradientTape(persistent=True) as tape1: critic1_loss = 0.5 * tf.reduce_mean( tf.square(self.critic1(s, a) - target_q)) critic2_loss = 0.5 * tf.reduce_mean( tf.square(self.critic2(s, a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip(critic1_gradients, self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip(critic2_gradients, self.critic2.trainable_variables)) del tape1 with tf.GradientTape() as tape2: s_action, s_logpi = self.actor(s) min_aq_rep = tf.minimum(self.critic1(s, s_action), self.critic2(s, s_action)) actor_loss = 0.5 * tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape2 if self.train_alpha == True: with tf.GradientTape() as tape3: _, s_logpi = self.actor(s) alpha_loss = -( tf.exp(self.log_alpha) * (tf.stop_gradient(s_logpi + self.target_entropy))) alpha_loss = 
tf.nn.compute_average_loss( alpha_loss) #from softlearning package alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha]) self.alpha_optimizer.apply_gradients( zip(alpha_grad, [self.log_alpha])) del tape3 if self.current_step % self.critic_update == 0: soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) total_a_loss += actor_loss.numpy() total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() if self.train_alpha == True: total_alpha_loss += alpha_loss.numpy() return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss], ['Loss/Critic2', total_c2_loss], ['Loss/alpha', total_alpha_loss], ['Alpha', tf.exp(self.log_alpha).numpy()]]
class TD3: def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.gamma = args.gamma self.tau = args.tau self.actor_lr = args.actor_lr self.critic_lr = args.critic_lr self.policy_delay = args.policy_delay self.actor_noise = args.actor_noise self.target_noise = args.target_noise self.noise_clip = args.noise_clip self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.actor, self.target_actor) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'TD3' def get_action(self, state): state = np.expand_dims(np.array(state), axis=0) noise = np.random.normal(loc=0, scale=self.actor_noise, size=self.action_dim) action = self.actor(state).numpy()[0] + noise action = np.clip(action, -1, 1) return action def eval_action(self, state): state = np.expand_dims(np.array(state), axis=0) action = self.actor(state).numpy()[0] action = np.clip(action, -1, 1) return action def train(self, training_num): total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 for i in range(training_num): self.current_step += 1 s, a, r, ns, d = self.buffer.sample(self.batch_size) target_action = tf.clip_by_value( self.target_actor(ns) + tf.clip_by_value( tf.random.normal(shape=self.target_actor(ns).shape, mean=0, stddev=self.target_noise), -self.noise_clip, self.noise_clip), -1, 1) target_value = tf.stop_gradient( r + self.gamma * (1 - d) * tf.minimum(self.target_critic1(ns, target_action), self.target_critic2(ns, target_action))) with tf.GradientTape(persistent=True) as tape: critic1_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic1(s, a))) critic2_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic2(s, a))) critic1_grad = tape.gradient(critic1_loss, self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip(critic1_grad, self.critic1.trainable_variables)) critic2_grad = tape.gradient(critic2_loss, self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip(critic2_grad, self.critic2.trainable_variables)) if self.current_step % self.policy_delay == 0: with tf.GradientTape() as tape2: actor_loss = -tf.reduce_mean(self.critic1( s, self.actor(s))) actor_grad = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables)) soft_update(self.actor, self.target_actor, self.tau) soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) del tape, tape2 total_a_loss += 
actor_loss.numpy() total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss], ['Loss/Critic2', total_c2_loss]]
class TRPO: def __init__(self, state_dim, action_dim, args): self.discrete = args.discrete self.buffer = Buffer(args.buffer_size) self.gamma = args.gamma self.lambda_gae = args.lambda_gae self.batch_size = args.batch_size self.backtrack_iter = args.backtrack_iter self.backtrack_coeff = args.backtrack_coeff self.delta = args.delta self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.state_dim = state_dim self.action_dim = action_dim self.training_start = 0 self.training_step = args.training_step if self.discrete == True: self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.backup_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) else: self.actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim) self.backup_actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim) self.critic = V_network(self.state_dim) self.network_list = {'Actor': self.actor, 'Critic': self.critic} self.name = 'TRPO' def get_action(self, state): state = np.expand_dims(np.array(state), axis=0) if self.discrete == True: policy = self.actor(state, activation='softmax').numpy()[0] action = np.random.choice(self.action_dim, 1, p=policy)[0] else: action = self.actor(state).numpy()[0] return action def eval_action(self, state): state = np.expand_dims(np.array(state), axis=0) if self.discrete == True: policy = self.actor(state, activation='softmax').numpy()[0] action = np.argmax(policy) else: action = self.actor(state, deterministic=True).numpy()[0] return action def fisher_vector_product(self, states, p): with tf.GradientTape() as tape2: with tf.GradientTape() as tape1: if self.discrete == True: kl_divergence = tfp.distributions.kl_divergence( tfp.distributions.Categorical( probs=self.actor(states, activation='softmax')), tfp.distributions.Categorical(probs=self.backup_actor( states, activation='softmax'))) else: dist = self.actor.dist(states) backup_dist = self.backup_actor.dist(states) kl_divergence = tfp.distributions.kl_divergence( dist, backup_dist) kl_divergence = tf.reduce_mean(kl_divergence) kl_grad = tape1.gradient(kl_divergence, self.actor.trainable_variables) flatten_kl_grad = tf.concat( [tf.reshape(grad, [-1]) for grad in kl_grad], axis=0) kl_grad_p = tf.reduce_sum(flatten_kl_grad * p) kl_hessian_p = tape2.gradient(kl_grad_p, self.actor.trainable_variables) flatten_kl_hessian_p = tf.concat( [tf.reshape(hessian, [-1]) for hessian in kl_hessian_p], axis=0).numpy() return flatten_kl_hessian_p + 0.1 * p def conjugate_gradient(self, states, b, nsteps): x = np.zeros_like(b) r = copy.deepcopy(b) p = copy.deepcopy(r) rdotr = np.dot(r, r) for i in range(nsteps): _Avp = self.fisher_vector_product(states, p) alpha = rdotr / (np.dot(p, _Avp) + 1e-8) x += alpha * p r -= alpha * _Avp new_rdotr = np.dot(r, r) beta = new_rdotr / (rdotr + 1e-8) p = r + beta * p rdotr = new_rdotr return x def update_model(self, model, new_variables): index = 0 for variable in model.trainable_variables: variable_length = len(tf.reshape(variable, [-1])) new_variable = new_variables[index:index + variable_length] new_variable = tf.reshape(new_variable, tf.shape(variable)) variable.assign(new_variable) index += variable_length def train(self, training_num): total_c_loss = 0 s, a, r, ns, d = self.buffer.all_sample() old_values = self.critic(s) returns = np.zeros_like(r.numpy()) advantages = np.zeros_like(returns) running_return = np.zeros(1) previous_value = np.zeros(1) running_advantage = 
np.zeros(1) for t in reversed(range(len(r))): #General Advantage Estimation running_return = (r[t] + self.gamma * running_return * (1 - d[t])).numpy() running_tderror = (r[t] + self.gamma * previous_value * (1 - d[t]) - old_values[t]).numpy() running_advantage = ( running_tderror + (self.gamma * self.lambda_gae) * running_advantage * (1 - d[t])).numpy() returns[t] = running_return previous_value = old_values[t] advantages[t] = running_advantage if self.discrete == True: old_policy = self.actor(s, activation='softmax') old_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1) old_log_policy = tf.reduce_sum(tf.math.log(old_policy) * tf.stop_gradient(old_a_one_hot), axis=1, keepdims=True) else: old_dist = self.actor.dist(s) old_log_policy = old_dist.log_prob(a) old_log_policy = tf.expand_dims(old_log_policy, axis=1) flattened_actor = tf.concat([ tf.reshape(variable, [-1]) for variable in self.actor.trainable_variables ], axis=0) self.update_model(self.backup_actor, flattened_actor) with tf.GradientTape() as tape: if self.discrete == True: policy = self.actor(s, activation='softmax') a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1) log_policy = tf.reduce_sum(tf.math.log(policy) * tf.stop_gradient(a_one_hot), axis=1, keepdims=True) surrogate = tf.reduce_mean( tf.exp(log_policy - tf.stop_gradient(old_log_policy)) * advantages) else: dist = self.actor.dist(s) log_policy = dist.log_prob(a) log_policy = tf.expand_dims(log_policy, axis=1) surrogate = tf.reduce_mean( tf.exp(log_policy - tf.stop_gradient(old_log_policy)) * advantages) policy_grad = tape.gradient(surrogate, self.actor.trainable_variables) flatten_policy_grad = tf.concat( [tf.reshape(grad, [-1]) for grad in policy_grad], axis=0) step_dir = self.conjugate_gradient(s, flatten_policy_grad.numpy(), 10) shs = 0.5 * tf.reduce_sum( step_dir * self.fisher_vector_product(s, step_dir), axis=0) step_size = 1 / tf.sqrt(shs / self.delta) full_step = step_size * step_dir expected_improve = tf.reduce_sum(flatten_policy_grad * full_step, axis=0) flag = False fraction = 1.0 for i in range(self.backtrack_iter): new_flattened_actor = flattened_actor + fraction * full_step self.update_model(self.actor, new_flattened_actor) if self.discrete == True: new_policy = self.actor(s, activation='softmax') new_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1) new_log_policy = tf.reduce_sum(tf.math.log(new_policy) * tf.stop_gradient(new_a_one_hot), axis=1, keepdims=True) else: new_dist = self.actor.dist(s) new_log_policy = new_dist.log_prob(a) new_log_policy = tf.expand_dims(new_log_policy, axis=1) new_surrogate = tf.reduce_mean( tf.exp(new_log_policy - old_log_policy) * advantages) loss_improve = new_surrogate - surrogate expected_improve *= fraction if self.discrete == True: new_kl_divergence = tfp.distributions.kl_divergence( tfp.distributions.Categorical( probs=self.actor(s, activation='softmax')), tfp.distributions.Categorical( probs=self.backup_actor(s, activation='softmax'))) else: new_dist = self.actor.dist(s) backup_dist = self.backup_actor.dist(s) new_kl_divergence = tfp.distributions.kl_divergence( new_dist, backup_dist) new_kl_divergence = tf.reduce_mean(new_kl_divergence) #print('kl: {:.4f} loss improve: {:.4f} expected improve: {:.4f} ' 'number of line search: {}'.format(new_kl_divergence.numpy(), loss_improve, expected_improve, i)) if new_kl_divergence.numpy( ) <= self.delta and loss_improve >= expected_improve: flag = True break fraction *= 
self.backtrack_coeff if not flag: self.update_model(self.actor, flattened_actor) print("Policy update failed") #critic_train n = len(s) arr = np.arange(n) for epoch in range(self.training_step): if n // self.batch_size > 0: np.random.shuffle(arr) batch_index = arr[:self.batch_size] batch_index.sort() else: batch_index = arr batch_s = s.numpy()[batch_index] batch_returns = returns[batch_index] with tf.GradientTape() as tape: critic_loss = 0.5 * tf.reduce_mean( tf.square( tf.stop_gradient(batch_returns) - self.critic(batch_s))) critic_variables = self.critic.trainable_variables critic_gradients = tape.gradient(critic_loss, critic_variables) self.critic_optimizer.apply_gradients( zip(critic_gradients, critic_variables)) total_c_loss += critic_loss.numpy() self.buffer.delete() return [['Loss/Critic', total_c_loss]]
class SACv2_AE: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = args.image_size self.current_step = 0 self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True) self.target_entropy = -action_dim self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.actor_update = args.actor_update self.critic_update = args.critic_update self.decoder_update = args.decoder_update self.decoder_latent_lambda = args.decoder_latent_lambda self.decoder_weight_lambda = args.decoder_weight_lambda self.training_start = args.training_start self.training_step = args.training_step self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.decoder = PixelDecoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.decoder_optimizer = tfa.optimizers.AdamW( weight_decay=self.decoder_weight_lambda, learning_rate=args.decoder_lr) self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder, 'Decoder': self.decoder } self.name = 'SACv2_AE' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature) action = action.numpy()[0] return action def eval_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature, deterministic=True) action = action.numpy()[0] return action def train(self, local_step): self.current_step += 1 total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_alpha_loss = 0 total_ae_loss = 0 loss_list = [] s, a, r, ns, d = self.buffer.sample(self.batch_size) ns_action, ns_logpi = self.actor(self.encoder(ns)) target_min_aq = tf.minimum( self.target_critic1(self.target_encoder(ns), ns_action), self.target_critic2(self.target_encoder(ns), ns_action)) target_q = tf.stop_gradient( r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi)) #critic update with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean( 
tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = tf.reduce_mean( tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 #actor update if self.current_step % self.actor_update == 0: with tf.GradientTape() as tape2: s_action, s_logpi = self.actor( tf.stop_gradient(self.encoder(s))) min_aq_rep = tf.minimum( self.critic1(tf.stop_gradient(self.encoder(s)), s_action), self.critic2(tf.stop_gradient(self.encoder(s)), s_action)) actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape2 #alpha update if self.train_alpha == True: with tf.GradientTape() as tape3: _, s_logpi = self.actor(self.encoder(s)) alpha_loss = -( tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy)) alpha_loss = tf.nn.compute_average_loss(alpha_loss) log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha]) self.log_alpha_optimizer.apply_gradients( zip(log_alpha_gradients, [self.log_alpha])) del tape3 if self.current_step % self.decoder_update == 0: #encoder, decoder update with tf.GradientTape(persistent=True) as tape4: feature = self.encoder(s) recovered_s = self.decoder(feature) real_s = preprocess_obs(s) rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s)) latent_loss = tf.reduce_mean( 0.5 * tf.reduce_sum(tf.square(feature), axis=1)) ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss encoder_gradients = tape4.gradient( ae_loss, self.encoder.trainable_variables) decoder_gradients = tape4.gradient( ae_loss, self.decoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) self.decoder_optimizer.apply_gradients( zip(decoder_gradients, self.decoder.trainable_variables)) if self.current_step % self.critic_update == 0: soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau) del tape4 total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) if self.current_step % self.decoder_update == 0: total_ae_loss += ae_loss.numpy() loss_list.append(['Loss/AutoEncoder', total_ae_loss]) if self.current_step % self.actor_update == 0: total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) if self.train_alpha == True: total_alpha_loss += alpha_loss.numpy() loss_list.append(['Loss/Alpha', total_alpha_loss]) loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()]) return loss_list
class DBC_TD3: def __init__(self, obs_dim, action_dim, hidden_dim=512, gamma=0.99, learning_rate=0.001, batch_size=512, policy_delay=2, actor_noise=0.1, target_noise=0.2, noise_clip=0.5, buffer_size=1e6, feature_dim=50, layer_num=4, filter_num=32, tau=0.005, encoder_tau=0.005, bisim_coef=0.5, training_start=1000): self.buffer = Buffer(buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.hidden_dim = hidden_dim self.gamma = gamma self.learning_rate = learning_rate self.batch_size = batch_size self.feature_dim = feature_dim self.layer_num = layer_num self.filter_num = filter_num self.tau = tau self.encoder_tau = encoder_tau self.bisim_coef = bisim_coef self.policy_delay = policy_delay self.actor_noise = actor_noise self.target_noise = target_noise self.noise_clip = noise_clip self.training_start = training_start self.actor = Policy_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_actor = Policy_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.target_encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.dynamics_model = Transition_Network(feature_dim, action_dim) self.reward_model = Reward_Network(feature_dim) copy_weight(self.actor, self.target_actor) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate) self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate) self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate) self.name = 'DBC_TD3' def get_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action = self.actor(feature).numpy()[0] return action def train(self, local_step): #critic -> transition -> reward -> encoder -> actor set1, set2 = self.buffer.dbc_sample(self.batch_size) s, a, r, ns, d = set1 s2, a2, r2, ns2, d2 = set2 target_action = tf.clip_by_value( self.target_actor(self.target_encoder(ns)) + tf.clip_by_value( tf.random.normal(shape=self.target_actor( self.target_encoder(ns)).shape, mean=0, stddev=self.target_noise), -self.noise_clip, self.noise_clip), -1, 1) target_value = tf.stop_gradient(r + self.gamma * (1 - d) * tf.minimum( self.target_critic1(self.target_encoder(ns), target_action), self.target_critic2(self.target_encoder(ns), target_action))) with tf.GradientTape(persistent=True) as tape1: critic1_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic1(self.encoder(s), a))) critic2_loss = 0.5 * tf.reduce_mean( tf.square(target_value - self.critic2(self.encoder(s), a))) critic1_grad = tape1.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_grad, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_grad = tape1.gradient( critic2_loss, self.encoder.trainable_variables + 
self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_grad, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 #train dynamics with tf.GradientTape() as tape2: feature = self.encoder(s) next_feature = self.encoder(ns) mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1)) if (sigma[0][0].numpy() == 0): sigma = tf.ones_like(mu) diff = (mu - tf.stop_gradient(next_feature)) / sigma dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) + tf.math.log(sigma)) dynamics_gradients = tape2.gradient( dynamics_loss, self.encoder.trainable_variables + self.dynamics_model.trainable_variables) self.dynamics_optimizer.apply_gradients( zip( dynamics_gradients, self.encoder.trainable_variables + self.dynamics_model.trainable_variables)) #dynamics_gradients = tape2.gradient(dynamics_loss, self.dynamics_model.trainable_variables) #self.dynamics_optimizer.apply_gradients(zip(dynamics_gradients, self.dynamics_model.trainable_variables)) del tape2 #train reward with tf.GradientTape() as tape3: feature = self.encoder(s) sample_dynamics = self.dynamics_model.sample( tf.concat([feature, a], axis=1)) reward_prediction = self.reward_model(sample_dynamics) reward_loss = tf.reduce_mean(tf.square(reward_prediction - (r))) reward_gradients = tape3.gradient( reward_loss, self.encoder.trainable_variables + self.reward_model.trainable_variables) self.reward_optimizer.apply_gradients( zip( reward_gradients, self.encoder.trainable_variables + self.reward_model.trainable_variables)) #reward_gradients = tape3.gradient(reward_loss, self.reward_model.trainable_variables) #self.reward_optimizer.apply_gradients(zip(reward_gradients, self.reward_model.trainable_variables)) del tape3 #train encoder with tf.GradientTape() as tape4: feature1 = self.encoder(s) feature2 = self.encoder(s2) mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1)) mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2], axis=1)) z_dist = tf.abs(feature1 - feature2) r_dist = tf.abs(r - r2) transition_dist = tf.sqrt( tf.square(tf.abs(mu1 - mu2)) + tf.square(tf.abs(sigma1 - sigma2))) bisimilarity = tf.stop_gradient( tf.cast(r_dist, tf.float32) + self.gamma * tf.cast(transition_dist, tf.float32)) encoder_loss = self.bisim_coef * tf.reduce_mean( tf.square(z_dist - bisimilarity)) encoder_gradients = tape4.gradient(encoder_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) del tape4 if local_step % (self.policy_delay) == 0: with tf.GradientTape() as tape5: actor_loss = -tf.reduce_mean( self.critic1(tf.stop_gradient(self.encoder(s)), self.actor(tf.stop_gradient( self.encoder(s))))) actor_grad = tape5.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables)) del tape5 soft_update(self.actor, self.target_actor, self.tau) soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau)
class DBC_SACv2: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True) self.target_entropy = -action_dim self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.actor_update = args.actor_update self.critic_update = args.critic_update self.training_start = args.training_start self.training_step = args.training_step self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.dynamics_model = Transition_Network(self.feature_dim, action_dim, deterministic=False) self.reward_model = Reward_Network(self.feature_dim) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr) self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr) self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr) self.current_step = 0 self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder, 'Dynamics': self.dynamics_model, 'Reward': self.reward_model } self.name = 'DBC_SACv2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature) action = action.numpy()[0] return action def eval_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature, deterministic=True) action = action.numpy()[0] return action def train(self, local_step): self.current_step += 1 total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_alpha_loss = 0 total_encoder_loss = 0 total_dynamics_loss = 0 total_reward_loss = 0 loss_list = [] s, a, r, ns, d = self.buffer.sample(self.batch_size) ns_action, ns_logpi = self.actor(self.encoder(ns)) target_min_aq = tf.minimum( self.target_critic1(self.target_encoder(ns), ns_action), self.target_critic2(self.target_encoder(ns), ns_action)) target_q = tf.stop_gradient( r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * ns_logpi)) with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean( tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = 
tf.reduce_mean( tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 if self.current_step % self.actor_update == 0: with tf.GradientTape() as tape2: s_action, s_logpi = self.actor( tf.stop_gradient(self.encoder(s))) min_aq_rep = tf.minimum( self.critic1(tf.stop_gradient(self.encoder(s)), s_action), self.critic2(tf.stop_gradient(self.encoder(s)), s_action)) actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape2 if self.train_alpha == True: with tf.GradientTape() as tape3: _, s_logpi = self.actor(self.encoder(s)) alpha_loss = -( tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy)) alpha_loss = tf.nn.compute_average_loss(alpha_loss) #alpha_loss = tf.reduce_mean(alpha_loss) log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha]) self.log_alpha_optimizer.apply_gradients( zip(log_alpha_gradients, [self.log_alpha])) del tape3 if self.current_step % self.critic_update == 0: soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau) #train encoder with tf.GradientTape() as tape4: new_ids = np.arange(len(s)) np.random.shuffle(new_ids) s2 = tf.gather(s, new_ids) feature = self.encoder(s) #feature2 = tf.gather(feature, new_ids) feature2 = self.encoder(s2) reward = self.reward_model(tf.stop_gradient(feature)) #reward2 = tf.gather(reward, new_ids) reward2 = self.reward_model(tf.stop_gradient(feature2)) feature_action, _ = self.actor(tf.stop_gradient(feature), True) feature2_action, _ = self.actor(tf.stop_gradient(feature2), True) mu, sigma = self.dynamics_model(tf.stop_gradient(feature), feature_action) mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2), feature2_action) z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2), shape=[-1, 1]) r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2), shape=[-1, 1]) transition_dist = tf.sqrt( tf.square(mu - mu2) + tf.square(sigma - sigma2)) bisimilarity = r_dist + self.gamma * transition_dist encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity)) encoder_gradients = tape4.gradient(encoder_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) #train dynamics with tf.GradientTape() as tape5: feature = self.encoder(s) mu, sigma = self.dynamics_model(feature, a) if (sigma[0][0].numpy() == 0): if self.dynamics_model.deterministic == False: print("error") sigma = tf.ones_like(mu) next_feature = self.encoder(ns) diff = (mu - tf.stop_gradient(next_feature)) / sigma dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) + tf.math.log(sigma)) dynamics_gradients = tape5.gradient( dynamics_loss, self.encoder.trainable_variables + self.dynamics_model.trainable_variables) self.dynamics_optimizer.apply_gradients( 
zip( dynamics_gradients, self.encoder.trainable_variables + self.dynamics_model.trainable_variables)) #train reward with tf.GradientTape() as tape6: feature = self.encoder(s) sample_dynamics = self.dynamics_model.sample(feature, a) reward_prediction = self.reward_model(sample_dynamics) reward_loss = tf.reduce_mean(tf.square(reward_prediction - r)) reward_gradients = tape6.gradient( reward_loss, self.encoder.trainable_variables + self.reward_model.trainable_variables) self.reward_optimizer.apply_gradients( zip( reward_gradients, self.encoder.trainable_variables + self.reward_model.trainable_variables)) total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) if self.current_step % self.actor_update == 0: total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) total_encoder_loss += encoder_loss.numpy() loss_list.append(['Loss/Encoder', total_encoder_loss]) total_dynamics_loss += dynamics_loss.numpy() loss_list.append(['Loss/Dynamics', total_dynamics_loss]) total_reward_loss += reward_loss.numpy() loss_list.append(['Loss/Reward', total_reward_loss]) if self.current_step % self.actor_update == 0 and self.train_alpha == True: total_alpha_loss += alpha_loss.numpy() loss_list.append(['Loss/Alpha', total_alpha_loss]) loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()]) return loss_list
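# --- Illustration (not part of the original source) ---
# copy_weight and soft_update are used throughout this file but defined elsewhere;
# a minimal sketch of what they are assumed to do (hard copy and Polyak averaging
# of trainable variables, matching the tau / encoder_tau usage above).
import tensorflow as tf

def copy_weight_sketch(network, target_network):
    # hard copy: theta_target <- theta
    for v, tv in zip(network.trainable_variables, target_network.trainable_variables):
        tv.assign(v)

def soft_update_sketch(network, target_network, tau):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for v, tv in zip(network.trainable_variables, target_network.trainable_variables):
        tv.assign(tau * v + (1.0 - tau) * tv)

# toy usage with two small dense layers
net = tf.keras.layers.Dense(4)
tgt = tf.keras.layers.Dense(4)
x = tf.ones((1, 3))
net(x)
tgt(x)                                   # build both sets of variables
copy_weight_sketch(net, tgt)
soft_update_sketch(net, tgt, tau=0.005)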
class DDQN: def __init__(self, state_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.optimizer = tf.keras.optimizers.Adam(args.learning_rate) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = args.batch_size self.gamma = args.gamma self.lr = args.learning_rate self.epsilon = args.epsilon self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.copy_iter = args.copy_iter self.network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) self.target_network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim) copy_weight(self.network, self.target_network) self.network_list = { 'Network': self.network, 'Target_Network': self.target_network } self.name = 'Double DQN' def get_action(self, state): state = np.expand_dims(np.array(state), axis=0) q_value = self.network(state, activation='linear').numpy() best_action = np.argmax(q_value, axis=1)[0] if np.random.random() < self.epsilon: return np.random.randint(low=0, high=self.action_dim) else: return best_action def eval_action(self, state): state = np.expand_dims(np.array(state), axis=0) q_value = self.network(state, activation='linear').numpy() best_action = np.argmax(q_value, axis=1)[0] return best_action def train(self, training_num): total_loss = 0 for i in range(training_num): self.current_step += 1 s, a, r, ns, d = self.buffer.sample(self.batch_size) q_value = tf.expand_dims(tf.argmax(self.network( ns, activation='linear'), axis=1, output_type=tf.int32), axis=1) q_value_one = tf.squeeze(tf.one_hot(q_value, depth=self.action_dim), axis=1) target_value = r + self.gamma * (1 - d) * tf.reduce_sum( self.target_network(ns, activation='linear') * q_value_one, axis=1, keepdims=True) target_value = tf.stop_gradient(target_value) with tf.GradientTape() as tape: selected_values = tf.reduce_sum( self.network(s, activation='linear') * tf.squeeze( tf.one_hot(tf.cast(a, tf.int32), self.action_dim), axis=1), axis=1, keepdims=True) loss = 0.5 * tf.math.reduce_mean( tf.square(target_value - selected_values)) variables = self.network.trainable_variables gradients = tape.gradient(loss, variables) self.optimizer.apply_gradients(zip(gradients, variables)) if self.current_step % self.copy_iter == 0: copy_weight(self.network, self.target_network) total_loss += loss.numpy() return [['Loss/Loss', total_loss]]
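# --- Illustration (not part of the original source) ---
# A standalone sketch of the Double DQN target built in DDQN.train above: the
# online network selects the argmax action at ns, the target network evaluates
# it. Toy constants stand in for network outputs and buffer contents.
import tensorflow as tf

online_q_ns = tf.constant([[1.0, 3.0, 2.0],
                           [0.5, 0.1, 0.9]])     # online net Q(ns, .)
target_q_ns = tf.constant([[10., 20., 30.],
                           [40., 50., 60.]])     # target net Q(ns, .)
r = tf.constant([[1.0], [2.0]])
d = tf.constant([[0.0], [1.0]])
gamma = 0.99

best_a = tf.argmax(online_q_ns, axis=1, output_type=tf.int32)          # action selection (online net)
one_hot = tf.one_hot(best_a, depth=online_q_ns.shape[1])
eval_q = tf.reduce_sum(target_q_ns * one_hot, axis=1, keepdims=True)   # action evaluation (target net)
target_value = tf.stop_gradient(r + gamma * (1 - d) * eval_q)
print(target_value.numpy())    # [[1 + 0.99 * 20], [2]]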
class ImageDQN: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.optimizer = tf.keras.optimizers.Adam(args.learning_rate) self.obs_dim = obs_dim self.action_dim = action_dim self.feature_dim = args.feature_dim self.batch_size = args.batch_size self.gamma = args.gamma self.learning_rate = args.learning_rate self.epsilon = args.epsilon self.training_start = args.training_start self.training_step = args.training_step self.current_step = 0 self.copy_iter = args.copy_iter self.layer_num = args.layer_num self.filter_num = args.filter_num self.network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_network = Policy_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num, 'channels_last') self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num, 'channels_last') copy_weight(self.network, self.target_network) copy_weight(self.encoder, self.target_encoder) self.network_list = { 'Network': self.network, 'Target_Network': self.target_network } self.name = 'ImageDQN' def get_action(self, obs): if np.random.random() < self.epsilon: return np.random.randint(low=0, high=self.action_dim) else: obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) q_value = self.network(feature, activation='linear').numpy() best_action = np.argmax(q_value, axis=1)[0] return best_action def eval_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) q_value = self.network(feature, activation='linear').numpy() best_action = np.argmax(q_value, axis=1)[0] return best_action def train(self, training_num): total_loss = 0 for i in range(training_num): self.current_step += 1 s, a, r, ns, d = self.buffer.sample(self.batch_size) target_q = tf.reduce_max(self.target_network( self.target_encoder(ns), activation='linear'), axis=1, keepdims=True) target_value = r + self.gamma * (1 - d) * target_q target_value = tf.stop_gradient(target_value) with tf.GradientTape() as tape: selected_values = tf.reduce_sum( self.network(self.encoder(s), activation='linear') * tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), self.action_dim), axis=1), axis=1, keepdims=True) loss = 0.5 * tf.reduce_mean( tf.square(target_value - selected_values)) gradients = tape.gradient( loss, self.encoder.trainable_variables + self.network.trainable_variables) self.optimizer.apply_gradients( zip( gradients, self.encoder.trainable_variables + self.network.trainable_variables)) if self.current_step % self.copy_iter == 0: copy_weight(self.network, self.target_network) copy_weight(self.encoder, self.target_encoder) total_loss += loss.numpy() del tape return [['Loss/Loss', total_loss]]
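# --- Illustration (not part of the original source) ---
# Both DDQN and ImageDQN above select Q(s, a) for the stored (float-typed) actions
# with a one-hot mask; an equivalent tf.gather-based selection is shown for
# comparison. Toy constants stand in for network outputs and buffer contents.
import tensorflow as tf

q_s = tf.constant([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])             # Q(s, .) from the network
a = tf.constant([[2.0], [0.0]])                  # actions as stored in the buffer

mask = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=3), axis=1)
selected = tf.reduce_sum(q_s * mask, axis=1, keepdims=True)            # [[3.], [4.]]

selected_gather = tf.gather(q_s, tf.cast(tf.squeeze(a, axis=1), tf.int32),
                            axis=1, batch_dims=1)                      # [3., 4.]
print(selected.numpy(), selected_gather.numpy())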
class RAD_SACv2: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = args.image_size self.pre_image_size = args.pre_image_size self.current_step = 0 self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha), trainable=True) self.target_entropy = -action_dim self.gamma = args.gamma self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.critic_update = args.critic_update self.training_start = args.training_start self.training_step = args.training_step self.train_alpha = args.train_alpha self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max) self.critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic1 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.target_critic2 = Q_network(self.feature_dim, self.action_dim, args.hidden_dim) self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr, beta_1=0.5) self.network_list = {'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder} self.aug_funcs = {} self.aug_list = { 'crop': rad.crop, 'grayscale': rad.random_grayscale(), 'cutout': rad.random_cutout(), 'cutout_color': rad.random_cutout_color(), 'flip': rad.random_flip(), 'rotate': rad.random_rotation(), 'rand_conv': rad.random_convolution(), 'color_jitter': rad.random_color_jitter(), 'no_aug': rad.no_aug } for aug_name in args.data_augs.split('-'): assert aug_name in self.aug_list self.aug_funcs[aug_name] = self.aug_list[aug_name] self.name = 'RAD_SACv2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature) action = action.numpy()[0] return action def eval_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature, deterministic=True) action = action.numpy()[0] return action def train(self, local_step): self.current_step += 1 total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_alpha_loss = 0 loss_list = [] s, a, r, ns, d = self.buffer.rad_sample(self.batch_size, self.aug_funcs, self.pre_image_size) ns_action, ns_logpi = self.actor(self.encoder(ns)) target_min_aq = tf.minimum(self.target_critic1(self.target_encoder(ns), ns_action), self.target_critic2(self.target_encoder(ns), ns_action)) target_q = 
tf.stop_gradient(r + self.gamma * (1 - d) * ( target_min_aq - self.alpha.numpy() * ns_logpi)) with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean(tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = tf.reduce_mean(tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape1.gradient(critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip(critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_gradients = tape1.gradient(critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip(critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 with tf.GradientTape() as tape2: s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s))) min_aq_rep = tf.minimum(self.critic1(tf.stop_gradient(self.encoder(s)), s_action), self.critic2(tf.stop_gradient(self.encoder(s)), s_action)) actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables)) del tape2 if self.train_alpha == True: with tf.GradientTape() as tape3: _, s_logpi = self.actor(self.encoder(s)) alpha_loss = -tf.exp(self.log_alpha) * tf.stop_gradient(s_logpi + self.target_entropy) alpha_loss = tf.nn.compute_average_loss(alpha_loss) #alpha_loss = tf.reduce_mean(alpha_loss) log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha]) self.log_alpha_optimizer.apply_gradients(zip(log_alpha_gradients, [self.log_alpha])) del tape3 if self.current_step % self.critic_update == 0: soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau) total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) if self.train_alpha == True: total_alpha_loss += alpha_loss.numpy() loss_list.append(['Loss/Alpha', total_alpha_loss]) loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()]) return loss_list
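# --- Illustration (not part of the original source) ---
# center_crop_image, called from get_action / eval_action above, is defined
# elsewhere; a minimal sketch of what it is assumed to do for channel-first pixel
# observations of shape (C, H, W): take a centered output_size x output_size
# window, as RAD/CURL-style agents typically do at evaluation time.
import numpy as np

def center_crop_image_sketch(image, output_size):
    h, w = image.shape[1], image.shape[2]
    top = (h - output_size) // 2
    left = (w - output_size) // 2
    return image[:, top:top + output_size, left:left + output_size]

obs = np.zeros((9, 100, 100), dtype=np.float32)   # e.g. 3 stacked RGB frames
print(center_crop_image_sketch(obs, 84).shape)    # (9, 84, 84)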
class CURL_SACv1: def __init__(self, obs_dim, action_dim, args): self.buffer = Buffer(args.buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.image_size = obs_dim[-1] self.gamma = args.gamma self.alpha = args.alpha self.batch_size = args.batch_size self.feature_dim = args.feature_dim self.curl_latent_dim = args.curl_latent_dim self.layer_num = args.layer_num self.filter_num = args.filter_num self.tau = args.tau self.encoder_tau = args.encoder_tau self.training_start = args.training_start self.training_step = args.training_step self.encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim, self.layer_num, self.filter_num) self.actor = Squashed_Gaussian_Actor( self.feature_dim, self.action_dim, args.hidden_dim, args.log_std_min, args.log_std_max, kernel_initializer=tf.keras.initializers.orthogonal()) self.critic1 = Q_network( self.feature_dim, self.action_dim, args.hidden_dim, kernel_initializer=tf.keras.initializers.orthogonal()) self.critic2 = Q_network( self.feature_dim, self.action_dim, args.hidden_dim, kernel_initializer=tf.keras.initializers.orthogonal()) self.v_network = V_network( self.feature_dim, args.hidden_dim, kernel_initializer=tf.keras.initializers.orthogonal()) self.target_v_network = V_network( self.feature_dim, args.hidden_dim, kernel_initializer=tf.keras.initializers.orthogonal()) self.curl = CURL(self.feature_dim, self.curl_latent_dim) copy_weight(self.v_network, self.target_v_network) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr) self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr) self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr) self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr) self.cpc_optimizer = tf.keras.optimizers.Adam(args.cpc_lr) self.current_step = 0 self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'V_network': self.v_network, 'Target_V_network': self.target_v_network, 'Curl': self.curl, 'Encoder': self.encoder, 'Target_Encoder': self.target_encoder } self.name = 'CURL_SACv1' def get_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature) action = action.numpy()[0] return action def eval_action(self, obs): if obs.shape[-1] != self.image_size: obs = center_crop_image(obs, self.image_size) obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action, _ = self.actor(feature, deterministic=True) action = action.numpy()[0] return action def train(self, training_step): total_a_loss = 0 total_c1_loss, total_c2_loss = 0, 0 total_v_loss = 0 total_cpc_loss = 0 loss_list = [] self.current_step += 1 s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample( self.batch_size, self.image_size) obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"] s_action, s_logpi = self.actor(self.encoder(s)) min_aq = tf.minimum(self.critic1(self.encoder(s), s_action), self.critic2(self.encoder(s), s_action)) target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi) with tf.GradientTape() as tape1: v_loss = 0.5 * tf.reduce_mean( tf.square( self.v_network(tf.stop_gradient(self.encoder(s))) - target_v)) v_gradients = tape1.gradient(v_loss, self.v_network.trainable_variables) 
self.v_network_optimizer.apply_gradients( zip(v_gradients, self.v_network.trainable_variables)) del tape1 target_q = tf.stop_gradient( r + self.gamma * (1 - d) * self.target_v_network(self.target_encoder(ns))) with tf.GradientTape(persistent=True) as tape2: critic1_loss = 0.5 * tf.reduce_mean( tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = 0.5 * tf.reduce_mean( tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape2.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) critic2_gradients = tape2.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) self.critic2_optimizer.apply_gradients( zip( critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape2 with tf.GradientTape() as tape3: s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s))) min_aq_rep = tf.minimum( self.critic1(tf.stop_gradient(self.encoder(s)), s_action), self.critic2(tf.stop_gradient(self.encoder(s)), s_action)) actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep) actor_gradients = tape3.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) soft_update(self.v_network, self.target_v_network, self.tau) with tf.GradientTape(persistent=True) as tape4: z_a = self.encoder(obs_anchor) z_pos = tf.stop_gradient(self.target_encoder(obs_pos)) logits = self.curl.compute_logits(z_a, z_pos) labels = tf.range(logits.shape[0], dtype='int64') cpc_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits)) cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables) self.cpc_optimizer.apply_gradients( (zip(cpc_gradients, self.curl.trainable_variables))) encoder_gradients = tape4.gradient(cpc_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) soft_update(self.encoder, self.target_encoder, self.encoder_tau) del tape4 total_v_loss += v_loss.numpy() loss_list.append(['Loss/V', total_v_loss]) total_c1_loss += critic1_loss.numpy() total_c2_loss += critic2_loss.numpy() loss_list.append(['Loss/Critic1', total_c1_loss]) loss_list.append(['Loss/Critic2', total_c2_loss]) total_a_loss += actor_loss.numpy() loss_list.append(['Loss/Actor', total_a_loss]) total_cpc_loss += cpc_loss.numpy() loss_list.append(['Loss/CPC', total_cpc_loss]) return loss_list
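# --- Illustration (not part of the original source) ---
# CURL.compute_logits is defined elsewhere; the contrastive update in
# CURL_SACv1.train above is consistent with the standard CURL formulation,
# sketched here under that assumption: a learned bilinear similarity
# logits = z_a @ W @ z_pos^T with the matching pair on the diagonal, trained with
# sparse softmax cross-entropy exactly as in tape4 above. W is an assumed
# parameter of the CURL module, not taken from this file.
import tensorflow as tf

batch, feature_dim = 8, 50
z_a = tf.random.normal((batch, feature_dim))                     # anchor features
z_pos = tf.random.normal((batch, feature_dim))                   # positive features (target encoder)
W = tf.Variable(tf.random.normal((feature_dim, feature_dim)))    # assumed bilinear parameter

logits = tf.matmul(z_a, tf.matmul(W, z_pos, transpose_b=True))   # (batch, batch) similarities
logits = logits - tf.reduce_max(logits, axis=1, keepdims=True)   # numerical stabilization
labels = tf.range(batch, dtype=tf.int64)                         # positives sit on the diagonal
cpc_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
print(cpc_loss.numpy())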
class SAC_v2: def __init__(self, state_dim, action_dim, hidden_dim=256, training_step=1, alpha=0.1, train_alpha=True, batch_size=128, buffer_size=1e6, tau=0.005, learning_rate=0.0003, gamma=0.99, reward_scale=1, training_start=500): self.buffer = Buffer(buffer_size) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.state_dim = state_dim self.action_dim = action_dim self.batch_size = batch_size self.tau = tau self.gamma = gamma self.reward_scale = reward_scale self.training_start = training_start self.training_step = training_step self.log_alpha = tf.Variable(np.log(alpha), dtype=tf.float32, trainable=True) self.target_entropy = -action_dim self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate) self.train_alpha = train_alpha self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.target_critic1 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) self.target_critic2 = Q_network(self.state_dim, self.action_dim, (hidden_dim, hidden_dim)) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) self.network_list = { 'Actor': self.actor, 'Critic1': self.critic1, 'Critic2': self.critic2, 'Target_Critic1': self.target_critic1, 'Target_Critic2': self.target_critic2 } self.name = 'SAC_v2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, state): state = np.expand_dims(np.array(state), axis=0) action = self.actor(state).numpy()[0] return action def train(self, training_num): for i in range(training_num): s, a, r, ns, d = self.buffer.sample(self.batch_size) target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)), self.target_critic2(ns, self.actor(ns))) target_q = tf.stop_gradient( r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns))) #critic training with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean( tf.square(self.critic1(s, a) - target_q)) critic2_loss = tf.reduce_mean( tf.square(self.critic2(s, a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip(critic1_gradients, self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip(critic2_gradients, self.critic2.trainable_variables)) del tape1 #actor training with tf.GradientTape() as tape2: mu, sigma = self.actor.mu_sigma(s) output = mu + tf.random.normal(shape=mu.shape) * sigma min_aq_rep = tf.minimum(self.critic1(s, output), self.critic2(s, output)) actor_loss = tf.reduce_mean(self.alpha.numpy() * self.actor.log_pi(s) - min_aq_rep) actor_gradients = tape2.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape2 #alpha(temperature) training if self.train_alpha == True: with tf.GradientTape() as tape3: alpha_loss = -(tf.exp(self.log_alpha) * (tf.stop_gradient( self.actor.log_pi(s) + self.target_entropy))) alpha_loss = tf.nn.compute_average_loss( alpha_loss) #from softlearning package alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha]) 
self.alpha_optimizer.apply_gradients( zip(alpha_grad, [self.log_alpha])) del tape3 soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau)
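# --- Illustration (not part of the original source) ---
# A standalone sketch of the temperature update used in SAC_v2.train above (and
# in the DBC/RAD variants): log_alpha is trained so that the policy entropy is
# pushed toward target_entropy = -action_dim. A toy tensor stands in for
# self.actor.log_pi(s).
import tensorflow as tf

action_dim = 2
target_entropy = -action_dim
log_alpha = tf.Variable(tf.math.log(0.1), dtype=tf.float32)
alpha_optimizer = tf.keras.optimizers.Adam(0.0003)

log_pi = tf.random.normal((128, 1)) - 1.0                # placeholder log pi(a|s)
with tf.GradientTape() as tape:
    alpha_loss = -(tf.exp(log_alpha) * tf.stop_gradient(log_pi + target_entropy))
    alpha_loss = tf.nn.compute_average_loss(alpha_loss)
grad = tape.gradient(alpha_loss, [log_alpha])
alpha_optimizer.apply_gradients(zip(grad, [log_alpha]))
print(tf.exp(log_alpha).numpy())                         # alpha after one update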
class DBC_SACv2: def __init__(self, obs_dim, action_dim, hidden_dim=256, gamma=0.99, learning_rate=1e-5, batch_size=128, buffer_size=1e6, feature_dim=50, layer_num=4, filter_num=32, tau=0.005, encoder_tau=0.005, bisim_coef=0.5, training_start=1000, train_alpha=True, alpha=0.1): self.buffer = Buffer(buffer_size) self.obs_dim = obs_dim self.action_dim = action_dim self.log_alpha = tf.Variable(initial_value=tf.math.log(alpha), trainable=True) self.target_entropy = -action_dim self.hidden_dim = hidden_dim self.gamma = gamma self.learning_rate = learning_rate self.bisim_coef = bisim_coef self.batch_size = batch_size self.feature_dim = feature_dim self.layer_num = layer_num self.filter_num = filter_num self.tau = tau self.encoder_tau = encoder_tau self.training_start = training_start self.train_alpha = train_alpha self.actor = Squashed_Gaussian_Actor(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic1 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.target_critic2 = Q_network(feature_dim, action_dim, (hidden_dim, hidden_dim)) self.encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.target_encoder = PixelEncoder(self.obs_dim, feature_dim, layer_num, filter_num) self.dynamics_model = Transition_Network(feature_dim, action_dim, deterministic=False) self.reward_model = Reward_Network(feature_dim) copy_weight(self.critic1, self.target_critic1) copy_weight(self.critic2, self.target_critic2) copy_weight(self.encoder, self.target_encoder) self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate) self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate) self.encoder_optimizer = tf.keras.optimizers.Adam(learning_rate) self.log_alpha_optimizer = tf.keras.optimizers.Adam(10 * learning_rate) self.dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate) self.reward_optimizer = tf.keras.optimizers.Adam(learning_rate) self.name = 'DBC_SACv2' @property def alpha(self): return tf.exp(self.log_alpha) def get_action(self, obs): obs = np.expand_dims(np.array(obs), axis=0) feature = self.encoder(obs) action = self.actor(feature).numpy()[0] return action def train(self, local_step): set1, set2 = self.buffer.dbc_sample(self.batch_size) s, a, r, ns, d = set1 s2, a2, r2, ns2, d2 = set2 target_min_aq = tf.minimum( self.target_critic1(self.target_encoder(ns), self.actor(self.encoder(ns))), self.target_critic2(self.target_encoder(ns), self.actor(self.encoder(ns)))) target_q = tf.stop_gradient(r + self.gamma * (1 - d) * (target_min_aq - self.alpha.numpy() * self.actor.log_pi(self.encoder(ns)))) with tf.GradientTape(persistent=True) as tape1: critic1_loss = tf.reduce_mean( tf.square(self.critic1(self.encoder(s), a) - target_q)) critic2_loss = tf.reduce_mean( tf.square(self.critic2(self.encoder(s), a) - target_q)) critic1_gradients = tape1.gradient( critic1_loss, self.encoder.trainable_variables + self.critic1.trainable_variables) self.critic1_optimizer.apply_gradients( zip( critic1_gradients, self.encoder.trainable_variables + self.critic1.trainable_variables)) critic2_gradients = tape1.gradient( critic2_loss, self.encoder.trainable_variables + self.critic2.trainable_variables) self.critic2_optimizer.apply_gradients( zip( critic2_gradients, self.encoder.trainable_variables + self.critic2.trainable_variables)) del tape1 #train 
dynamics(encoder used together) next_feature = self.encoder(ns) with tf.GradientTape() as tape2: feature = self.encoder(s) mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1)) if (sigma[0][0].numpy() == 0): if self.dynamics_model.deterministic == False: print("error") sigma = tf.ones_like(mu) diff = (mu - tf.stop_gradient(next_feature)) / sigma dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) + tf.math.log(sigma)) dynamics_gradients = tape2.gradient( dynamics_loss, self.encoder.trainable_variables + self.dynamics_model.trainable_variables) self.dynamics_optimizer.apply_gradients( zip( dynamics_gradients, self.encoder.trainable_variables + self.dynamics_model.trainable_variables)) del tape2 #train rewards(encoder used together) with tf.GradientTape() as tape3: feature = self.encoder(s) sample_dynamics = self.dynamics_model.sample( tf.concat([feature, a], axis=1)) reward_prediction = self.reward_model(sample_dynamics) reward_loss = tf.reduce_mean(tf.square(reward_prediction - r)) reward_gradients = tape3.gradient( reward_loss, self.encoder.trainable_variables + self.reward_model.trainable_variables) self.reward_optimizer.apply_gradients( zip( reward_gradients, self.encoder.trainable_variables + self.reward_model.trainable_variables)) del tape3 # train encoder with tf.GradientTape() as tape4: feature1 = self.encoder(s) feature2 = self.encoder(s2) mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1)) mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2], axis=1)) z_dist = tf.abs(feature1 - feature2) r_dist = tf.abs(r - r2) transition_dist = tf.sqrt( tf.square(tf.abs(mu1 - mu2)) + tf.square(tf.abs(sigma1 - sigma2))) bisimilarity = ( tf.cast(r_dist, tf.float32) + self.gamma * tf.cast(transition_dist, tf.float32)).numpy() encoder_loss = self.bisim_coef * tf.reduce_mean( tf.square(z_dist - bisimilarity)) encoder_gradients = tape4.gradient(encoder_loss, self.encoder.trainable_variables) self.encoder_optimizer.apply_gradients( zip(encoder_gradients, self.encoder.trainable_variables)) del tape4 if local_step % 2 == 0: with tf.GradientTape() as tape5: mu, sigma = self.actor.mu_sigma( tf.stop_gradient(self.encoder(s))) output = mu + tf.random.normal(shape=mu.shape) * sigma min_aq_rep = tf.minimum( self.critic1(tf.stop_gradient(self.encoder(s)), output), self.critic2(tf.stop_gradient(self.encoder(s)), output)) actor_loss = tf.reduce_mean( self.alpha.numpy() * self.actor.log_pi(tf.stop_gradient(self.encoder(s))) - min_aq_rep) actor_gradients = tape5.gradient(actor_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) del tape5 if self.train_alpha == True: with tf.GradientTape() as tape6: alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient( self.actor.log_pi(self.encoder(s)) + self.target_entropy)) alpha_loss = tf.nn.compute_average_loss(alpha_loss) log_alpha_gradients = tape6.gradient(alpha_loss, [self.log_alpha]) self.log_alpha_optimizer.apply_gradients( zip(log_alpha_gradients, [self.log_alpha])) del tape6 soft_update(self.critic1, self.target_critic1, self.tau) soft_update(self.critic2, self.target_critic2, self.tau) soft_update(self.encoder, self.target_encoder, self.encoder_tau)
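# --- Illustration (not part of the original source) ---
# A standalone sketch of the reparameterized sample used in the actor update
# above (output = mu + noise * sigma): drawing the action this way keeps the
# sample differentiable with respect to mu and sigma, which is what lets the
# actor loss backpropagate through the critic into the policy parameters. The
# loss below is a placeholder, not the actual -Q(s, pi(s)) term.
import tensorflow as tf

mu = tf.Variable([[0.0, 0.0]])
sigma = tf.Variable([[1.0, 1.0]])
with tf.GradientTape() as tape:
    output = mu + tf.random.normal(shape=mu.shape) * sigma    # reparameterization trick
    placeholder_loss = tf.reduce_mean(tf.square(output))      # stands in for -min Q(s, output)
grads = tape.gradient(placeholder_loss, [mu, sigma])
print([g.numpy() for g in grads])                             # both non-None: gradients flow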