def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            pi_h = tf.layers.flatten(obs)

        if len(self.layers) > 0:
            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        # State-dependent aggregation weights over the source policies' actions
        master_W, master_b = get_aggregation_var(pi_h, name_scope='master',
                                                 n_sources=self.n_sources,
                                                 SDW=self.SDW,
                                                 n_actions=self.n_actions,
                                                 no_bias=self.no_bias)
        self.act_mu = mu_ = affine_transformation(self.sources_actions, master_W, master_b)

        # Important difference with SAC and other algorithms such as PPO:
        # the std depends on the state, so we cannot use stable_baselines.common.distributions
        log_std = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None, name='log_std')

        # OpenAI variation to cap the standard deviation:
        # activation = tf.tanh  # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original implementation:
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: reg params for log_std and mu

        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi)

        # Keep the squashed actions strictly inside the action-space bounds
        if isinstance(self.ac_space, gym.spaces.Box):
            policy = tf.clip_by_value(policy, self.ac_space.low + EPS, self.ac_space.high - EPS)
            deterministic_policy = tf.clip_by_value(deterministic_policy,
                                                    self.ac_space.low + EPS,
                                                    self.ac_space.high - EPS)

        self.policy = policy
        self.deterministic_policy = deterministic_policy

    return deterministic_policy, policy, logp_pi
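# The bodies of get_aggregation_var() and affine_transformation() are not shown in
# this section. The numpy sketch below only illustrates the kind of computation the
# actor above performs: the master network produces state-dependent weights over the
# source policies' actions, and mu_ is their affine combination. The shapes, the
# softmax-style normalisation, and the helper name are assumptions, not the repo's code.
import numpy as np

def affine_transformation_sketch(sources_actions, master_w, master_b):
    # sources_actions: (batch, n_sources, n_actions) actions proposed by the source policies
    # master_w:        (batch, n_sources) state-dependent mixture weights (assumed shape)
    # master_b:        (batch, n_actions) optional bias (zeros if no_bias=True)
    return np.einsum('bs,bsa->ba', master_w, sources_actions) + master_b

batch, n_sources, n_actions = 4, 3, 2
actions = np.random.randn(batch, n_sources, n_actions)
w = np.random.rand(batch, n_sources)
w /= w.sum(axis=1, keepdims=True)            # e.g. normalised weights per state
b = np.zeros((batch, n_actions))
print(affine_transformation_sketch(actions, w, b).shape)   # (4, 2)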
def logpac(self, action):
    from stable_baselines.sac.policies import gaussian_likelihood, EPS

    act_mu = self.policy_tf.act_mu
    log_std = tf.log(self.policy_tf.std)
    # Potentially we need to clip the atanh input differently and pass the gradient through
    log_u = gaussian_likelihood(tf.atanh(tf.clip_by_value(action, -0.99, 0.99)), act_mu, log_std)
    # Change of variables for the tanh squashing: subtract sum_i log(1 - a_i^2)
    log_ac = log_u - tf.reduce_sum(tf.log(1 - action ** 2 + EPS), axis=1)
    return log_ac
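# Minimal numpy check of the change of variables logpac() relies on, assuming a
# diagonal Gaussian squashed by tanh (as in SAC):
#   log pi(a) = log N(atanh(a); mu, std) - sum_i log(1 - a_i^2)
# The EPS value and the helper below are illustrative, not the library's code.
import numpy as np

def gaussian_logp(x, mu, log_std):
    # log-density of x under a diagonal Gaussian N(mu, exp(log_std)^2)
    std = np.exp(log_std)
    return np.sum(-0.5 * (((x - mu) / std) ** 2 + 2 * log_std + np.log(2 * np.pi)), axis=-1)

rng = np.random.default_rng(0)
mu, log_std = rng.normal(size=3), 0.1 * rng.normal(size=3)
u = mu + np.exp(log_std) * rng.normal(size=3)    # pre-squash Gaussian sample
a = np.tanh(u)                                   # squashed action in (-1, 1)

log_u = gaussian_logp(np.arctanh(a), mu, log_std)
log_ac = log_u - np.sum(np.log(1 - a ** 2 + 1e-6), axis=-1)
print(log_ac)   # log-density of the squashed action a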
def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        # if self.feature_extraction == "cnn":
        #     pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        # else:
        #     pi_h = tf.layers.flatten(obs)
        pi_h = CnnMlpFeatureExtractor(obs)
        pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        self.act_mu = mu_ = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None)
        # Important difference with SAC and other algorithms such as PPO:
        # the std depends on the state, so we cannot use stable_baselines.common.distributions
        log_std = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None)

        # Regularize policy output (not used for now)
        # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2)
        # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu_ ** 2)
        # self.reg_loss = reg_loss

        # OpenAI variation to cap the standard deviation:
        # activation = tf.tanh  # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original implementation:
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: reg params for log_std and mu

        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi)

        self.policy = policy
        self.deterministic_policy = deterministic_policy

    return deterministic_policy, policy, logp_pi
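# For reference, numpy sketches of what the SAC helpers used above compute
# (gaussian_likelihood, gaussian_entropy, apply_squashing_func). These are written
# from the SAC formulas, not copied from stable_baselines, so treat them as an
# approximation of the library's behaviour rather than its exact code.
import numpy as np

EPS = 1e-6  # small constant for numerical stability (assumed value)

def gaussian_likelihood_np(x, mu, log_std):
    # log-density of x under a diagonal Gaussian N(mu, exp(log_std)^2), summed over dims
    pre_sum = -0.5 * (((x - mu) / (np.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return np.sum(pre_sum, axis=-1)

def gaussian_entropy_np(log_std):
    # entropy of a diagonal Gaussian, summed over action dimensions
    return np.sum(log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1)

def apply_squashing_func_np(mu, pi, logp_pi):
    # squash the mean and the sample with tanh, and correct the log-prob accordingly
    deterministic_policy = np.tanh(mu)
    policy = np.tanh(pi)
    logp_pi = logp_pi - np.sum(np.log(1 - policy ** 2 + EPS), axis=-1)
    return deterministic_policy, policy, logp_pi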