def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn", create_vf=True, create_qf=True):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        # if self.feature_extraction == "cnn":
        #     critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        # else:
        #     critics_h = tf.layers.flatten(obs)
        critics_h = CnnMlpFeatureExtractor(obs)

        if create_vf:
            # Value function
            with tf.variable_scope('vf', reuse=reuse):
                vf_h = mlp(critics_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                value_fn = tf.layers.dense(vf_h, 1, name="vf")
            self.value_fn = value_fn

        # if create_qf and action.get_shape().as_list()[0] == critics_h.get_shape().as_list()[0]:
        if create_qf:
            # action = tf.Print(action, [tf.shape(action)], "action shape: ")
            # critics_h = tf.Print(critics_h, [tf.shape(critics_h)], "critics_h shape: ")
            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

    return self.qf1, self.qf2, self.value_fn
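# NOTE: CnnMlpFeatureExtractor is an external helper that is not defined in this file.
# Below is a minimal sketch of what such a shared CNN feature extractor could look like,
# loosely following stable-baselines' nature_cnn. The name, kernel sizes and strides are
# assumptions for illustration, not the original implementation.
def cnn_mlp_feature_extractor(obs, act_fun=tf.nn.relu, scope="cnn_mlp_fe"):
    """Hypothetical CNN front-end: three conv layers followed by a flatten."""
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        h = act_fun(tf.layers.conv2d(obs, filters=32, kernel_size=8, strides=4, name="c1"))
        h = act_fun(tf.layers.conv2d(h, filters=64, kernel_size=4, strides=2, name="c2"))
        h = act_fun(tf.layers.conv2d(h, filters=64, kernel_size=3, strides=1, name="c3"))
        return tf.layers.flatten(h)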
def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            pi_h = tf.layers.flatten(obs)

        if len(self.layers) > 0:
            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        master_W, master_b = get_aggregation_var(pi_h, name_scope='master', n_sources=self.n_sources,
                                                 SDW=self.SDW, n_actions=self.n_actions, no_bias=self.no_bias)
        self.act_mu = mu_ = affine_transformation(self.sources_actions, master_W, master_b)
        # Important difference with SAC and other algos such as PPO:
        # the std depends on the state, so we cannot use stable_baselines.common.distribution
        log_std = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None, name='log_std')

    # OpenAI variation to cap the standard deviation:
    # activation = tf.tanh  # for log_std
    # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
    # Original implementation:
    log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

    self.std = std = tf.exp(log_std)
    # Reparameterization trick
    pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
    logp_pi = gaussian_likelihood(pi_, mu_, log_std)
    self.entropy = gaussian_entropy(log_std)
    # MISSING: reg params for log and mu
    # Apply squashing and account for it in the probability
    deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi)

    if isinstance(self.ac_space, gym.spaces.Box):
        policy = tf.clip_by_value(policy, self.ac_space.low + EPS, self.ac_space.high - EPS)
        deterministic_policy = tf.clip_by_value(deterministic_policy, self.ac_space.low + EPS,
                                                self.ac_space.high - EPS)

    self.policy = policy
    self.deterministic_policy = deterministic_policy

    return deterministic_policy, policy, logp_pi
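# get_aggregation_var and affine_transformation are external helpers (not shown here).
# A rough, hypothetical sketch of the intended aggregation: the "master" head produces one
# weight per source policy, and the mean action is a weighted combination of the source
# actions plus an optional bias. Shapes and semantics below are assumptions, not the
# original code.
def affine_transformation_sketch(sources_actions, master_w, master_b=None):
    """sources_actions: [batch, n_sources, n_actions]; master_w: [batch, n_sources, 1]."""
    mu = tf.reduce_sum(sources_actions * master_w, axis=1)  # weighted sum over sources
    if master_b is not None:
        mu = mu + master_b  # optional learned bias in action space
    return mu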
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
    if obs is None:
        obs = self.processed_obs

    if self.obs_module_indices is not None:
        obs = tf.gather(obs, self.obs_module_indices["vf"], axis=-1)

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn" and self.cnn_vf:
            critics_h = self.cnn_extractor(obs, name="vf_c1", act_fun=self.activ_fn, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qf_h = tf.concat([critics_h, action], axis=-1)

        # Double Q values to reduce overestimation
        with tf.variable_scope('qf1', reuse=reuse):
            qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

        with tf.variable_scope('qf2', reuse=reuse):
            qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

        self.qf1 = qf1
        self.qf2 = qf2
        # TODO: the ratio below assumes qf1 and qf2 never have opposite signs
        # self.q_discrepancy = tf.square(self.qf1 - self.qf2) / tf.square(tf.maximum(self.qf1, self.qf2))
        self.q_discrepancy = tf.abs(self.qf1 - self.qf2)

    return self.qf1, self.qf2
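# One possible use of q_discrepancy (an assumption for illustration, not part of the
# original training code): fetch it at evaluation time as a cheap proxy for critic
# disagreement / epistemic uncertainty. `policy` and `action_ph` stand in for whatever
# objects hold these tensors in the surrounding model.
def critic_disagreement(sess, policy, action_ph, obs_batch, action_batch):
    """Return |Q1 - Q2| for a batch of state-action pairs."""
    return sess.run(policy.q_discrepancy,
                    feed_dict={policy.obs_ph: obs_batch, action_ph: action_batch})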
def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    if self.obs_module_indices is not None:
        obs = tf.gather(obs, self.obs_module_indices["pi"], axis=-1)

    if self.policy is not None:
        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, name="pi_c1", act_fun=self.activ_fn, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

            self.policy_t = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)
    else:
        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, name="pi_c1", act_fun=self.activ_fn, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

            self.policy = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)

    return policy
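# obs_module_indices selects a subset of observation dimensions for each module via
# tf.gather(obs, indices, axis=-1). A hypothetical configuration for illustration
# (the indices are made up, not taken from the original experiments):
obs_module_indices_example = {
    "pi": [0, 1, 2, 3],        # actor only sees the first four observation features
    "vf": [0, 1, 2, 3, 4, 5],  # critics see the full (here 6-dimensional) observation
}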
def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        # if self.feature_extraction == "cnn":
        #     pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        # else:
        #     pi_h = tf.layers.flatten(obs)
        pi_h = CnnMlpFeatureExtractor(obs)

        pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        self.act_mu = mu_ = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None)
        # Important difference with SAC and other algos such as PPO:
        # the std depends on the state, so we cannot use stable_baselines.common.distribution
        log_std = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=None)

    # Regularize policy output (not used for now)
    # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2)
    # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu ** 2)
    # self.reg_loss = reg_loss

    # OpenAI variation to cap the standard deviation:
    # activation = tf.tanh  # for log_std
    # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
    # Original implementation:
    log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

    self.std = std = tf.exp(log_std)
    # Reparameterization trick
    pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
    logp_pi = gaussian_likelihood(pi_, mu_, log_std)
    self.entropy = gaussian_entropy(log_std)
    # MISSING: reg params for log and mu
    # Apply squashing and account for it in the probability
    deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi)
    self.policy = policy
    self.deterministic_policy = deterministic_policy

    return deterministic_policy, policy, logp_pi
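# For reference, the tanh squashing correction applied by apply_squashing_func in
# stable-baselines' SAC looks roughly like the sketch below: squash the mean and the
# sampled action through tanh, then correct the log-probability with the
# change-of-variables term. Reproduced as an approximation, not copied from this repository.
def apply_squashing_func_sketch(mu_, pi_, logp_pi):
    deterministic_policy = tf.tanh(mu_)
    policy = tf.tanh(pi_)
    # Log-determinant of the tanh Jacobian; EPS avoids log(0) at the boundaries
    logp_pi -= tf.reduce_sum(tf.log(1 - policy ** 2 + EPS), axis=1)
    return deterministic_policy, policy, logp_pi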
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            critics_h = tf.layers.flatten(obs)

        # Concatenate preprocessed state and action
        qf_h = tf.concat([critics_h, action], axis=-1)

        # Double Q values to reduce overestimation
        with tf.variable_scope('qf1', reuse=reuse):
            qf1_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

        with tf.variable_scope('qf2', reuse=reuse):
            qf2_h = mlp(qf_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)
            qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

        self.qf1 = qf1
        self.qf2 = qf2

    return self.qf1, self.qf2
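# The two critics above implement the clipped double-Q trick: the Bellman target uses the
# smaller of the two estimates to reduce overestimation bias. A minimal sketch of how such
# a target can be formed (the names rewards_ph, dones_ph and gamma are assumptions; TD3
# would additionally add clipped noise to the target action before evaluating the critics):
def clipped_double_q_backup(rewards_ph, dones_ph, qf1_target, qf2_target, gamma=0.99):
    """Standard clipped double-Q target built from two target-critic estimates."""
    min_qf_target = tf.minimum(qf1_target, qf2_target)
    return tf.stop_gradient(rewards_ph + (1.0 - dones_ph) * gamma * min_qf_target)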
def make_actor(self, obs=None, reuse=False, scope="pi"):
    if obs is None:
        obs = self.processed_obs

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        else:
            pi_h = tf.layers.flatten(obs)

        pi_h = mlp(pi_h, self.layers, self.activ_fn, layer_norm=self.layer_norm)

        self.policy = policy = tf.layers.dense(pi_h, self.ac_space.shape[0], activation=tf.tanh)

    return policy
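# The actor ends in tanh, so its output lives in [-1, 1]. Rescaling to the environment's
# bounds is typically done outside the policy graph; a small helper in the spirit of
# stable-baselines' action unscaling (illustrative sketch, not this repository's code):
def unscale_action_sketch(action_space, scaled_action):
    """Map an action from [-1, 1] back to [action_space.low, action_space.high]."""
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)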
def make_critics(self, obs=None, action=None, reuse=False, scope="values_fn", create_vf=True, create_qf=True):
    if obs is None:
        obs = self.processed_obs

    # with tf.variable_scope("attention_critic", reuse=tf.AUTO_REUSE):
    #     if self.feature_extraction == "attention_mlp":
    #         latent = attention_mlp_extractor2(tf.layers.flatten(obs), n_object=self.n_object, n_units=128)

    with tf.variable_scope(scope, reuse=reuse):
        if self.feature_extraction == "cnn":
            critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
        # elif self.feature_extraction == "attention_mlp":
        #     critics_h = latent
        else:
            critics_h = tf.layers.flatten(obs)

        if create_vf:
            # Value function
            with tf.variable_scope('vf', reuse=reuse):
                critics_latent = critics_h
                if self.feature_extraction == "attention_mlp":
                    with tf.variable_scope("attention", reuse=reuse):
                        critics_latent = attention_mlp_extractor2(critics_h, n_object=self.n_object, n_units=128)
                elif self.feature_extraction == "attention_mlp_particle":
                    with tf.variable_scope("attention", reuse=reuse):
                        critics_latent = attention_mlp_extractor_particle(critics_h, n_object=3, n_units=128)
                vf_h = mlp(critics_latent, self.critic_layers, self.activ_fn, layer_norm=self.layer_norm)
                value_fn = tf.layers.dense(vf_h, 1, name="vf")
            self.value_fn = value_fn

        if create_qf:
            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = qf_h
                if self.feature_extraction == "attention_mlp":
                    with tf.variable_scope("attention", reuse=reuse):
                        qf1_h = attention_mlp_extractor2(qf_h, n_object=self.n_object, n_units=128, has_action=True)
                elif self.feature_extraction == "attention_mlp_particle":
                    with tf.variable_scope("attention", reuse=reuse):
                        qf1_h = attention_mlp_extractor_particle(qf_h, n_object=3, n_units=128, has_action=True)
                qf1_h = mlp(qf1_h, self.critic_layers, self.activ_fn, layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = qf_h
                if self.feature_extraction == "attention_mlp":
                    with tf.variable_scope("attention", reuse=reuse):
                        qf2_h = attention_mlp_extractor2(qf_h, n_object=self.n_object, n_units=128, has_action=True)
                elif self.feature_extraction == "attention_mlp_particle":
                    with tf.variable_scope("attention", reuse=reuse):
                        qf2_h = attention_mlp_extractor_particle(qf_h, n_object=3, n_units=128, has_action=True)
                qf2_h = mlp(qf2_h, self.critic_layers, self.activ_fn, layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

    return self.qf1, self.qf2, self.value_fn
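# Typical SAC wiring for the create_vf / create_qf flags (illustrative, following the way
# stable-baselines' SAC builds its graph; the attribute names below are assumptions):
# qf1, qf2, value_fn = policy_tf.make_critics(obs_ph, actions_ph,
#                                             create_qf=True, create_vf=True)
# qf1_pi, qf2_pi, _ = policy_tf.make_critics(obs_ph, policy_out,
#                                            create_qf=True, create_vf=False, reuse=True)
# _, _, value_target = target_policy.make_critics(next_obs_ph,
#                                                  create_qf=False, create_vf=True)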