Example #1
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn",
                     create_vf=True,
                     create_qf=True):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            # if self.feature_extraction == "cnn":
            #     critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            # else:
            #     critics_h = tf.layers.flatten(obs)
            critics_h = CnnMlpFeatureExtractor(obs)
            if create_vf:
                # Value function
                with tf.variable_scope('vf', reuse=reuse):
                    vf_h = mlp(critics_h,
                               self.layers,
                               self.activ_fn,
                               layer_norm=self.layer_norm)
                    value_fn = tf.layers.dense(vf_h, 1, name="vf")
                self.value_fn = value_fn

            # if create_qf and action.get_shape().as_list()[0] == critics_h.get_shape().as_list()[0]:
            if create_qf:

                # action = tf.Print(action, [tf.shape(action)], "action shape: ")
                # critics_h = tf.Print(critics_h, [tf.shape(critics_h)], "critics_h shape: ")

                # Concatenate preprocessed state and action
                qf_h = tf.concat([critics_h, action], axis=-1)

                # Double Q values to reduce overestimation
                with tf.variable_scope('qf1', reuse=reuse):
                    qf1_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

                with tf.variable_scope('qf2', reuse=reuse):
                    qf2_h = mlp(qf_h,
                                self.layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

                self.qf1 = qf1
                self.qf2 = qf2

        return self.qf1, self.qf2, self.value_fn
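
Every example in this listing pushes its hidden features through an `mlp` helper that is not shown in the snippets themselves. The sketch below is a minimal reconstruction, consistent with the stable-baselines (TF1) helper these policies appear to be based on; treat it as an assumption, not as part of the original files.

import tensorflow as tf

def mlp(input_tensor, layers, activ_fn=tf.nn.relu, layer_norm=False):
    # Sketch of the `mlp` helper (assumed, stable-baselines style):
    # a stack of fully connected layers, with optional layer normalization
    # applied before the activation.
    output = input_tensor
    for i, layer_size in enumerate(layers):
        output = tf.layers.dense(output, layer_size, name="fc" + str(i))
        if layer_norm:
            output = tf.contrib.layers.layer_norm(output, center=True, scale=True)
        output = activ_fn(output)
    return output
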
Example #2
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            if len(self.layers) > 0:
                pi_h = mlp(pi_h,
                           self.layers,
                           self.activ_fn,
                           layer_norm=self.layer_norm)

            master_W, master_b = get_aggregation_var(pi_h,
                                                     name_scope='master',
                                                     n_sources=self.n_sources,
                                                     SDW=self.SDW,
                                                     n_actions=self.n_actions,
                                                     no_bias=self.no_bias)

            self.act_mu = mu_ = affine_transformation(self.sources_actions,
                                                      master_W, master_b)

            # Important difference between SAC and other algorithms such as PPO:
            # the std depends on the state, so we cannot use stable_baselines.common.distribution
            log_std = tf.layers.dense(pi_h,
                                      self.ac_space.shape[0],
                                      activation=None,
                                      name='log_std')

        # OpenAI Variation to cap the standard deviation
        # activation = tf.tanh # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original Implementation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: regularization params for log_std and mu
        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            mu_, pi_, logp_pi)

        if isinstance(self.ac_space, gym.spaces.Box):
            policy = tf.clip_by_value(policy, self.ac_space.low + EPS,
                                      self.ac_space.high - EPS)
            deterministic_policy = tf.clip_by_value(deterministic_policy,
                                                    self.ac_space.low + EPS,
                                                    self.ac_space.high - EPS)

        self.policy = policy
        self.deterministic_policy = deterministic_policy

        return deterministic_policy, policy, logp_pi
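
Example #2 relies on several SAC utilities (LOG_STD_MIN, LOG_STD_MAX, EPS, gaussian_likelihood, gaussian_entropy, apply_squashing_func) that are imported elsewhere. The sketch below reconstructs them in the style of the stable-baselines SAC implementation; the constants and formulas are assumptions based on that library, not taken from these files. (get_aggregation_var and affine_transformation are project-specific and are not reconstructed here.)

import numpy as np
import tensorflow as tf

EPS = 1e-6          # avoids division by zero / log(0)
LOG_STD_MAX = 2     # cap on the state-dependent log standard deviation
LOG_STD_MIN = -20

def gaussian_likelihood(input_, mu_, log_std):
    # Log-density of a diagonal Gaussian, summed over action dimensions.
    pre_sum = -0.5 * (((input_ - mu_) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)

def gaussian_entropy(log_std):
    # Entropy of a diagonal Gaussian, summed over action dimensions.
    return tf.reduce_sum(log_std + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1)

def apply_squashing_func(mu_, pi_, logp_pi):
    # Squash the action through tanh and correct the log-probability
    # for the change of variables.
    deterministic_policy = tf.tanh(mu_)
    policy = tf.tanh(pi_)
    logp_pi -= tf.reduce_sum(tf.log(1 - policy ** 2 + EPS), axis=1)
    return deterministic_policy, policy, logp_pi
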
Example #3
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn"):
        if obs is None:
            obs = self.processed_obs

        if self.obs_module_indices is not None:
            obs = tf.gather(obs, self.obs_module_indices["vf"], axis=-1)

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn" and self.cnn_vf:
                critics_h = self.cnn_extractor(obs,
                                               name="vf_c1",
                                               act_fun=self.activ_fn,
                                               **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h,
                            self.layers,
                            self.activ_fn,
                            layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h,
                            self.layers,
                            self.activ_fn,
                            layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2
            # TODO: the relative form below assumes qf1 and qf2 never have opposite signs
            #self.q_discrepancy = tf.square(self.qf1 - self.qf2) / tf.square(tf.maximum(self.qf1, self.qf2))
            self.q_discrepancy = tf.abs(self.qf1 - self.qf2)

        return self.qf1, self.qf2
Example #4
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        if self.obs_module_indices is not None:
            obs = tf.gather(obs, self.obs_module_indices["pi"], axis=-1)

        if self.policy is not None:
            with tf.variable_scope(scope, reuse=reuse):
                if self.feature_extraction == "cnn":
                    pi_h = self.cnn_extractor(obs,
                                              name="pi_c1",
                                              act_fun=self.activ_fn,
                                              **self.cnn_kwargs)
                else:
                    pi_h = tf.layers.flatten(obs)

                pi_h = mlp(pi_h,
                           self.layers,
                           self.activ_fn,
                           layer_norm=self.layer_norm)

                self.policy_t = policy = tf.layers.dense(
                    pi_h, self.ac_space.shape[0], activation=tf.tanh)
        else:
            with tf.variable_scope(scope, reuse=reuse):
                if self.feature_extraction == "cnn":
                    pi_h = self.cnn_extractor(obs,
                                              name="pi_c1",
                                              act_fun=self.activ_fn,
                                              **self.cnn_kwargs)
                else:
                    pi_h = tf.layers.flatten(obs)

                pi_h = mlp(pi_h,
                           self.layers,
                           self.activ_fn,
                           layer_norm=self.layer_norm)

                self.policy = policy = tf.layers.dense(pi_h,
                                                       self.ac_space.shape[0],
                                                       activation=tf.tanh)

        return policy
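
The `obs_module_indices` gather used in Examples #3 and #4 lets the actor ("pi") and the critics ("vf") see different slices of the observation vector. A small, self-contained illustration follows; the index values are made up for the example.

import tensorflow as tf

# Hypothetical split: the actor sees features 0-2, the critics see features 2-4.
obs_module_indices = {"pi": [0, 1, 2], "vf": [2, 3, 4]}
obs_ph = tf.placeholder(tf.float32, shape=(None, 5))
pi_obs = tf.gather(obs_ph, obs_module_indices["pi"], axis=-1)  # shape (None, 3)
vf_obs = tf.gather(obs_ph, obs_module_indices["vf"], axis=-1)  # shape (None, 3)
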
Example #5
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            # if self.feature_extraction == "cnn":
            #     pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            # else:
            #     pi_h = tf.layers.flatten(obs)

            pi_h = CnnMlpFeatureExtractor(obs)

            pi_h = mlp(pi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)

            self.act_mu = mu_ = tf.layers.dense(pi_h,
                                                self.ac_space.shape[0],
                                                activation=None)
            # Important difference between SAC and other algorithms such as PPO:
            # the std depends on the state, so we cannot use stable_baselines.common.distribution
            log_std = tf.layers.dense(pi_h,
                                      self.ac_space.shape[0],
                                      activation=None)

        # Regularize policy output (not used for now)
        # reg_loss = self.reg_weight * 0.5 * tf.reduce_mean(log_std ** 2)
        # reg_loss += self.reg_weight * 0.5 * tf.reduce_mean(mu ** 2)
        # self.reg_loss = reg_loss

        # OpenAI Variation to cap the standard deviation
        # activation = tf.tanh # for log_std
        # log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
        # Original Implementation
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)

        self.std = std = tf.exp(log_std)
        # Reparameterization trick
        pi_ = mu_ + tf.random_normal(tf.shape(mu_)) * std
        logp_pi = gaussian_likelihood(pi_, mu_, log_std)
        self.entropy = gaussian_entropy(log_std)
        # MISSING: regularization params for log_std and mu
        # Apply squashing and account for it in the probability
        deterministic_policy, policy, logp_pi = apply_squashing_func(
            mu_, pi_, logp_pi)
        self.policy = policy
        self.deterministic_policy = deterministic_policy

        return deterministic_policy, policy, logp_pi
Example #6
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                critics_h = tf.layers.flatten(obs)

            # Concatenate preprocessed state and action
            qf_h = tf.concat([critics_h, action], axis=-1)

            # Double Q values to reduce overestimation
            with tf.variable_scope('qf1', reuse=reuse):
                qf1_h = mlp(qf_h,
                            self.layers,
                            self.activ_fn,
                            layer_norm=self.layer_norm)
                qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

            with tf.variable_scope('qf2', reuse=reuse):
                qf2_h = mlp(qf_h,
                            self.layers,
                            self.activ_fn,
                            layer_norm=self.layer_norm)
                qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

            self.qf1 = qf1
            self.qf2 = qf2

        return self.qf1, self.qf2
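
The "Double Q values to reduce overestimation" comment refers to clipped double-Q learning: the training graph takes the minimum of the two critic heads when forming the Bellman target. The sketch below shows that consumption pattern for illustration only; the placeholder names and the target tensors are assumptions standing in for parts of the model not shown in these snippets.

import tensorflow as tf

# Illustrative stand-ins; in the real graph these come from make_critics()
# on the main and target networks respectively (shapes (None, 1)).
qf1 = tf.placeholder(tf.float32, shape=(None, 1))
qf2 = tf.placeholder(tf.float32, shape=(None, 1))
qf1_target = tf.placeholder(tf.float32, shape=(None, 1))
qf2_target = tf.placeholder(tf.float32, shape=(None, 1))
rewards_ph = tf.placeholder(tf.float32, shape=(None, 1))
terminals_ph = tf.placeholder(tf.float32, shape=(None, 1))
gamma = 0.99

# Clipped double-Q: take the pessimistic (minimum) of the two target heads.
min_qf_target = tf.minimum(qf1_target, qf2_target)
# Bellman backup; stop_gradient keeps the target out of the critics' gradients.
q_backup = tf.stop_gradient(
    rewards_ph + (1.0 - terminals_ph) * gamma * min_qf_target)
qf1_loss = tf.reduce_mean((q_backup - qf1) ** 2)
qf2_loss = tf.reduce_mean((q_backup - qf2) ** 2)
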
Example #7
    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)

            pi_h = mlp(pi_h,
                       self.layers,
                       self.activ_fn,
                       layer_norm=self.layer_norm)

            self.policy = policy = tf.layers.dense(pi_h,
                                                   self.ac_space.shape[0],
                                                   activation=tf.tanh)

        return policy
    def make_critics(self,
                     obs=None,
                     action=None,
                     reuse=False,
                     scope="values_fn",
                     create_vf=True,
                     create_qf=True):
        if obs is None:
            obs = self.processed_obs

        # with tf.variable_scope("attention_critic", reuse=tf.AUTO_REUSE):
        #     if self.feature_extraction == "attention_mlp":
        #         latent = attention_mlp_extractor2(tf.layers.flatten(obs), n_object=self.n_object, n_units=128)

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                critics_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            # elif self.feature_extraction == "attention_mlp":
            #     critics_h = latent
            else:
                critics_h = tf.layers.flatten(obs)

            if create_vf:
                # Value function
                with tf.variable_scope('vf', reuse=reuse):
                    critics_latent = critics_h
                    if self.feature_extraction == "attention_mlp":
                        with tf.variable_scope("attention", reuse=reuse):
                            critics_latent = attention_mlp_extractor2(
                                critics_h, n_object=self.n_object, n_units=128)
                    elif self.feature_extraction == "attention_mlp_particle":
                        with tf.variable_scope("attention", reuse=reuse):
                            critics_latent = attention_mlp_extractor_particle(
                                critics_h, n_object=3, n_units=128)
                    vf_h = mlp(critics_latent,
                               self.critic_layers,
                               self.activ_fn,
                               layer_norm=self.layer_norm)
                    value_fn = tf.layers.dense(vf_h, 1, name="vf")
                self.value_fn = value_fn

            if create_qf:
                # Concatenate preprocessed state and action
                qf_h = tf.concat([critics_h, action], axis=-1)

                # Double Q values to reduce overestimation
                with tf.variable_scope('qf1', reuse=reuse):
                    qf1_h = qf_h
                    if self.feature_extraction == "attention_mlp":
                        with tf.variable_scope("attention", reuse=reuse):
                            qf1_h = attention_mlp_extractor2(
                                qf_h,
                                n_object=self.n_object,
                                n_units=128,
                                has_action=True)
                    elif self.feature_extraction == "attention_mlp_particle":
                        with tf.variable_scope("attention", reuse=reuse):
                            qf1_h = attention_mlp_extractor_particle(
                                qf_h, n_object=3, n_units=128, has_action=True)
                    qf1_h = mlp(qf1_h,
                                self.critic_layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf1 = tf.layers.dense(qf1_h, 1, name="qf1")

                with tf.variable_scope('qf2', reuse=reuse):
                    qf2_h = qf_h
                    if self.feature_extraction == "attention_mlp":
                        with tf.variable_scope("attention", reuse=reuse):
                            qf2_h = attention_mlp_extractor2(
                                qf_h,
                                n_object=self.n_object,
                                n_units=128,
                                has_action=True)
                    elif self.feature_extraction == "attention_mlp_particle":
                        with tf.variable_scope("attention", reuse=reuse):
                            qf2_h = attention_mlp_extractor_particle(
                                qf_h, n_object=3, n_units=128, has_action=True)
                    qf2_h = mlp(qf2_h,
                                self.critic_layers,
                                self.activ_fn,
                                layer_norm=self.layer_norm)
                    qf2 = tf.layers.dense(qf2_h, 1, name="qf2")

                self.qf1 = qf1
                self.qf2 = qf2

        return self.qf1, self.qf2, self.value_fn
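
For context, these make_actor / make_critics methods are normally wired together in the algorithm's graph-building step. The function below is a hypothetical wrapper sketching the stable-baselines SAC call pattern that Examples #1 and #7 appear to follow; the name build_sac_value_graph and its arguments are illustrative, not part of the original code.

def build_sac_value_graph(policy_tf, processed_obs_ph, actions_ph):
    # Hypothetical wrapper (assumed call pattern, stable-baselines SAC style).
    deterministic_action, policy_out, logp_pi = policy_tf.make_actor(processed_obs_ph)
    # Critics at the actions stored in the replay buffer (trains the Q heads and V).
    qf1, qf2, value_fn = policy_tf.make_critics(
        processed_obs_ph, actions_ph, create_qf=True, create_vf=True)
    # The same critic variables re-used (reuse=True) on the actions proposed by
    # the current policy; these feed the actor loss.
    qf1_pi, qf2_pi, _ = policy_tf.make_critics(
        processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True)
    return (deterministic_action, policy_out, logp_pi,
            qf1, qf2, value_fn, qf1_pi, qf2_pi)
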