def init_actor_loss(self):
    # Build the policy-gradient loss with an entropy bonus for the given policy type.
    if self.policy_type == "GaussianPolicy":
        mu, std = self.policy.mu_and_std(self.states_ph)
        norm_dist = tfd.Normal(loc=mu, scale=std)
        entropy = norm_dist.entropy()
        # Log-probability of the tanh-squashed action: invert the squash with atanh,
        # then apply the change-of-variables correction -log(1 - a^2).
        log_actions = norm_dist.log_prob(tf_v1.atanh(self.actions_ph))
        log_actions -= tf_v1.log(1.0 - self.actions_ph**2 + 1e-8)
        log_actions = tf_v1.reduce_sum(log_actions, axis=-1, keepdims=True)
        # log_actions = self.policy.log_prob(self.states_ph, self.actions_ph)
    elif self.policy_type == "DiscretePolicy":
        action_probs = self.policy.model(self.states_ph)
        entropy = -tf_v1.reduce_sum(tf_v1.multiply(tf_v1.log(action_probs), action_probs), axis=-1)
        hot_encoded = tf_v1.one_hot(self.actions_ph, self.action_size)
        log_actions = tf_v1.log(tf_v1.reduce_sum(hot_encoded * action_probs, axis=-1))
    else:
        raise NotImplementedError(f"Received {self.policy_type}. This should never have happened!")
    # Policy-gradient term weighted by the targets (e.g. advantages), plus entropy regularization.
    log_loss = -log_actions * self.targets_ph
    entropy_loss = -self.alpha * entropy
    loss = log_loss + entropy_loss
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph)
    train_op = get_clipped_train_op(loss, optimizer, var_list=self.policy.trainable_vars,
                                    clip_norm=self.policy.clip_norm)
    self.policy.setup_loss(loss, train_op)
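# A minimal standalone check (NumPy/SciPy, outside the TF graph) of the tanh
# change-of-variables correction used above: for a = tanh(u) with u ~ N(mu, std),
# log p(a) = log N(u; mu, std) - log(1 - a^2). Names and values here are illustrative only.
import numpy as np
from scipy.stats import norm

mu, std = 0.3, 0.7
u = 1.1                            # pre-squash sample
a = np.tanh(u)                     # squashed action in (-1, 1)

# Analytic log-density of the squashed action (same correction as in the loss above).
log_p = norm.logpdf(u, mu, std) - np.log(1.0 - a**2 + 1e-8)

# Finite-difference estimate of the same density via the CDF of a = tanh(u).
eps = 1e-4
cdf = lambda x: norm.cdf(np.arctanh(x), mu, std)
log_p_numeric = np.log((cdf(a + eps / 2) - cdf(a - eps / 2)) / eps)

print(log_p, log_p_numeric)        # the two values agree to several decimal places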
def init_critics_loss(self):
    q_targets = tf_v1.stop_gradient(self.get_q_targets())
    for critic in self.critics:
        q_predictions = critic([self.states_ph, self.actions_ph])
        q_loss = tf_v1.reduce_mean(tf_v1.losses.huber_loss(labels=q_targets, predictions=q_predictions))
        # q_loss = tf_v1.losses.mean_squared_error(labels=q_targets, predictions=q_predictions, weights=0.5)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.q_lr_ph)
        train_op = get_clipped_train_op(q_loss, optimizer=optimizer, var_list=critic.trainable_vars,
                                        clip_norm=self.clip_norm)
        critic.setup_loss(q_loss, train_op)
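# get_q_targets() is defined elsewhere in the agent. A plausible sketch of what it
# computes for SAC-style twin critics is the soft Bellman backup
# y = r + gamma * (1 - done) * (min_i Q_i'(s', a') - alpha * log pi(a'|s')).
# The attribute names used below (next_states_ph, rewards_ph, dones_ph, gamma,
# target_critics) are assumptions, not necessarily this repo's.
def get_q_targets(self):
    next_actions = self.policy.model(self.next_states_ph)
    next_log_pi = self.policy.log_prob(self.next_states_ph, next_actions)
    next_qs = tuple(Q([self.next_states_ph, next_actions]) for Q in self.target_critics)
    min_next_q = tf_v1.reduce_min(next_qs, axis=0)
    soft_value = min_next_q - self.alpha_tf * next_log_pi
    return self.rewards_ph + self.gamma * (1.0 - self.dones_ph) * soft_value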
def init_actor_loss(self):
    # Deterministic policy gradient: maximize the critic's value of the policy's own actions.
    actions = self.policy.model(self.states_ph)
    q_value = self.critic([self.states_ph, actions])
    loss = -tf_v1.reduce_mean(q_value)
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph)
    train_op = get_clipped_train_op(loss, optimizer, var_list=self.policy.trainable_vars,
                                    clip_norm=self.policy.clip_norm)
    self.policy.setup_loss(loss, train_op)
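# get_clipped_train_op is used by every loss in this section but defined elsewhere.
# A plausible minimal implementation (an assumption, not the repo's code) clips the
# gradients by global norm before applying them:
def get_clipped_train_op(loss, optimizer, var_list, clip_norm):
    grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
    grads, variables = zip(*grads_and_vars)
    if clip_norm is not None:
        grads, _ = tf_v1.clip_by_global_norm(grads, clip_norm)
    return optimizer.apply_gradients(zip(grads, variables))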
def init_q(self):
    q_targets = self.get_q_target()
    q_predictions = self.q_net(self.states_ph)
    batch_size = tf_v1.shape(self.actions_ph)[0]
    indices = tf_v1.stack([tf_v1.range(batch_size), self.actions_ph], axis=-1)
    # In q_predictions, replace only the Q value at each row's taken action index with its
    # target, so the squared error is non-zero only for the actions that were actually taken.
    q_targets = tf_v1.tensor_scatter_nd_update(q_predictions, indices, q_targets)
    q_loss = tf_v1.losses.mean_squared_error(labels=q_targets, predictions=q_predictions)
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
    train_op = get_clipped_train_op(q_loss, optimizer=optimizer, var_list=self.q_net.trainable_vars,
                                    clip_norm=self.clip_norm)
    self.q_net.setup_loss(q_loss, train_op)
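# A quick NumPy analogue (illustrative names and values only) of the scatter trick
# above: only the Q value at each row's chosen action index is replaced by its
# target, so the squared error vanishes everywhere else.
import numpy as np

q_predictions = np.array([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0]])         # [batch, n_actions]
actions = np.array([2, 0])                          # chosen action per row
q_targets = np.array([2.5, 3.5])                    # Bellman target per row

updated = q_predictions.copy()
updated[np.arange(len(actions)), actions] = q_targets
# updated == [[1.0, 2.0, 2.5],
#             [3.5, 5.0, 6.0]]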
def init_actor(self):
    # SAC actor: maximize the minimum critic value plus the entropy bonus.
    actions = self.policy.model(self.states_ph)
    q_values = tuple(Q([self.states_ph, actions]) for Q in self.critics)
    min_q = tf_v1.reduce_min(q_values, axis=0)
    log_actions = self.policy.log_prob(self.states_ph, actions)
    kl_loss = min_q - self.alpha_tf * log_actions
    policy_loss = -tf_v1.reduce_mean(kl_loss)
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.policy.lr_ph, name="policy_optimizer")
    actor_train_op = get_clipped_train_op(policy_loss, optimizer, var_list=self.policy.trainable_vars,
                                          clip_norm=self.policy.clip_norm)
    self.policy.setup_loss(policy_loss, actor_train_op)
    if self.auto_ent:
        # Automatic temperature tuning: push the policy entropy towards the target entropy.
        alpha_loss = -self.alpha_tf * tf_v1.stop_gradient(log_actions + self.target_entropy)
        self.alpha_loss_tf = tf_v1.reduce_mean(alpha_loss)
        optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.alpha_lr_ph, name="alpha_optimizer")
        self.alpha_train_op = optimizer.minimize(self.alpha_loss_tf, var_list=[self.log_alpha_tf])
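# self.log_alpha_tf, self.alpha_tf, self.alpha_lr_ph and self.target_entropy are
# created elsewhere in the agent. A common setup (an assumption about this repo)
# parameterizes the temperature through its log so that alpha stays positive and
# uses the SAC heuristic target entropy of minus the action dimension:
import tensorflow.compat.v1 as tf_v1          # assumed to be the tf_v1 used above

action_size = 2                               # illustrative continuous action dim
log_alpha_tf = tf_v1.get_variable("log_alpha", shape=(), dtype=tf_v1.float32,
                                  initializer=tf_v1.zeros_initializer())
alpha_tf = tf_v1.exp(log_alpha_tf)            # temperature used in both losses above
alpha_lr_ph = tf_v1.placeholder(tf_v1.float32, shape=(), name="alpha_lr")
target_entropy = -float(action_size)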
def init_q(self):
    q_targets = self.get_q_target()
    batch_size = tf_v1.shape(self.actions_ph)[0]
    indices = tf_v1.stack([tf_v1.range(batch_size), self.actions_ph], axis=-1)
    q_predictions = self.q_net(self.states_ph)
    q_targets = tf_v1.tensor_scatter_nd_update(q_predictions, indices, q_targets)
    # q_predictions = tf_v1.gather_nd(q_predictions, indices)
    # q_loss = tf_v1.reduce_mean(tf_v1.losses.huber_loss(labels=q_targets, predictions=q_predictions))
    q_loss = tf_v1.losses.mean_squared_error(labels=q_targets, predictions=q_predictions)
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
    train_op = get_clipped_train_op(q_loss, optimizer=optimizer, var_list=self.q_net.trainable_vars,
                                    clip_norm=self.clip_norm)
    self.q_net.setup_loss(q_loss, train_op)
def init_critic(self):
    loss = tf_v1.losses.mean_squared_error(labels=self.targets_ph, predictions=self.critic.output)
    optimizer = tf_v1.train.AdamOptimizer(learning_rate=self.lr_ph)
    train_op = get_clipped_train_op(loss, optimizer=optimizer, var_list=self.critic.trainable_vars,
                                    clip_norm=self.clip_norm)
    self.critic.setup_loss(loss, train_op)
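# targets_ph is fed from outside the graph. One common choice for a state-value
# critic (an assumption, not necessarily what this repo feeds) is the bootstrapped
# TD(0) target r + gamma * V(s') * (1 - done), computed with the current value
# estimates before each update:
import numpy as np

def td0_targets(rewards, next_values, dones, gamma=0.99):
    """One-step bootstrapped targets for a state-value critic."""
    return rewards + gamma * next_values * (1.0 - dones)

targets = td0_targets(np.array([1.0, 0.0]),       # rewards
                      np.array([0.5, 0.2]),       # V(s') predicted by the critic
                      np.array([0.0, 1.0]))       # done flags
# targets == [1.495, 0.0]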