Example #1
    def build_graph(self):
        """
        Builds computational graph for policy
        """
        with tf.variable_scope(self.name):
            # build the actual policy network
            self.obs_var, self.mean_var = create_mlp(name='mean_network',
                                                     output_dim=self.action_dim,
                                                     hidden_sizes=self.hidden_sizes,
                                                     hidden_nonlinearity=self.hidden_nonlinearity,
                                                     output_nonlinearity=self.output_nonlinearity,
                                                     input_dim=(None, self.obs_dim,)
                                                     )

            with tf.variable_scope("log_std_network"):
                log_std_var = tf.get_variable(name='log_std_var',
                                              shape=(1, self.action_dim,),
                                              dtype=tf.float32,
                                              initializer=tf.constant_initializer(self.init_log_std),
                                              trainable=self.learn_std
                                              )

                self.log_std_var = tf.maximum(log_std_var, self.min_log_std, name='log_std')

            # symbolically define sampled action and distribution
            # sample with the clipped log-std so actions respect min_log_std
            self.action_var = self.mean_var + tf.random_normal(shape=tf.shape(self.mean_var)) * tf.exp(self.log_std_var)
            self._dist = DiagonalGaussian(self.action_dim)

            # save the policy's trainable variables in dicts
            current_scope = tf.get_default_graph().get_name_scope()
            trainable_policy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
            self.policy_params = OrderedDict([(remove_scope_from_name(var.name, current_scope), var) for var in trainable_policy_vars])
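A minimal usage sketch for the graph above (hypothetical; the `policy` instance, the dummy observation batch, and treating `policy.obs_var` as a feedable placeholder created by `create_mlp` are assumptions for illustration):

    # Hypothetical usage: sample actions from the built Gaussian policy graph (TF1.x graph mode).
    import numpy as np
    import tensorflow as tf

    obs_batch = np.random.randn(8, policy.obs_dim).astype(np.float32)  # assumed dummy observations
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        actions, means, log_stds = sess.run(
            [policy.action_var, policy.mean_var, policy.log_std_var],
            feed_dict={policy.obs_var: obs_batch})
    # each row of `actions` is mean + eps * exp(log_std) for one observation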
Example #2
    def build_graph(self):
        """
        Builds computational graph for policy
        """
        with tf.variable_scope(self.name):
            # build the actual policy network
            self.obs_var, self.prob_var = create_mlp(
                name='prob_network',
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=(
                    None,
                    self.obs_dim,
                ))

            # symbolically define sampled action and distribution
            self.action_var = tf.random.categorical(tf.log(self.prob_var), 1)
            self._dist = Categorical(self.action_dim)

            # save the policy's trainable variables in dicts
            current_scope = tf.get_default_graph().get_name_scope()
            trainable_policy_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
            self.policy_params = OrderedDict([
                (remove_scope_from_name(var.name, current_scope), var)
                for var in trainable_policy_vars
            ])
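`tf.random.categorical` returns int64 indices of shape (batch_size, num_samples); with num_samples=1 here, `self.action_var` holds one sampled action index per observation. A short hedged sketch of fetching them (the `policy` instance and dummy batch are assumptions):

    # Hypothetical usage: draw discrete action indices from the graph above.
    import numpy as np
    import tensorflow as tf

    obs_batch = np.random.randn(4, policy.obs_dim).astype(np.float32)  # assumed dummy observations
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        action_idx = sess.run(policy.action_var, feed_dict={policy.obs_var: obs_batch})
    action_idx = action_idx[:, 0]  # drop the num_samples=1 axis -> shape (batch_size,)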
Example #3
    def distribution_info_sym(self, obs_var, params=None):
        """
        Return the symbolic distribution information about the actions.

        Args:
            obs_var (placeholder) : symbolic variable for observations
            params (dict) : a dictionary of placeholders or vars with the parameters of the MLP

        Returns:
            (dict) : a dictionary of tf tensors describing the policy output distribution (keys: mean, log_std)
        """
        if params is None:
            with tf.variable_scope(self.name):
                obs_var, mean_var = create_mlp(
                    name='mean_network',
                    output_dim=self.action_dim,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=obs_var,
                    reuse=True,
                )
                log_std_var = self.log_std_var
        else:
            mean_network_params = OrderedDict()
            log_std_network_params = []
            for name, param in params.items():
                if 'log_std_network' in name:
                    log_std_network_params.append(param)
                else:  # if 'mean_network' in name:
                    mean_network_params[name] = param

            assert len(log_std_network_params) == 1
            obs_var, mean_var = forward_mlp(
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_var=obs_var,
                mlp_params=mean_network_params,
            )

            log_std_var = log_std_network_params[0]

        return dict(mean=mean_var, log_std=log_std_var)
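A hedged sketch of the params-based path (the observation placeholder and the reuse of `policy.policy_params` as the supplied dict are assumptions; in practice the dict could instead hold adapted parameter tensors):

    # Hypothetical usage: the dict must be keyed like policy.policy_params, i.e. names
    # containing 'mean_network' or 'log_std_network', so the single log-std entry and the
    # MLP weights are routed to the right branches above.
    from collections import OrderedDict
    import tensorflow as tf

    obs_ph = tf.placeholder(tf.float32, shape=(None, policy.obs_dim))  # assumed placeholder
    custom_params = OrderedDict(policy.policy_params)  # stand-in for e.g. adapted tensors
    dist_info = policy.distribution_info_sym(obs_ph, params=custom_params)
    mean_sym, log_std_sym = dist_info['mean'], dist_info['log_std']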
Example #4
    def distribution_info_sym(self, obs_var, params=None):
        """
        Return the symbolic distribution information about the actions.

        Args:
            obs_var (placeholder) : symbolic variable for observations
            params (dict) : a dictionary of placeholders or vars with the parameters of the MLP

        Returns:
            (dict) : a dictionary of tf tensors describing the policy output distribution (key: prob)
        """
        if params is None:
            with tf.variable_scope(self.name):
                obs_var, prob_var = create_mlp(
                    name='prob_network',
                    output_dim=self.action_dim,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=obs_var,
                    reuse=True,
                )
        else:
            # all supplied parameters belong to the prob network
            prob_network_params = OrderedDict(params)

            obs_var, prob_var = forward_mlp(
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_var=obs_var,
                mlp_params=prob_network_params,
            )

        return dict(prob=prob_var)
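A hedged follow-up showing one way the returned dict might be consumed with plain TF ops (not the library's Categorical class; the placeholders and the softmax output nonlinearity are assumptions):

    # Hypothetical: log-probability of taken actions under dist_info['prob'],
    # assuming a softmax output nonlinearity so each probability row sums to 1.
    import tensorflow as tf

    obs_ph = tf.placeholder(tf.float32, shape=(None, policy.obs_dim))   # assumed placeholder
    actions_ph = tf.placeholder(tf.int32, shape=(None,))                # assumed taken-action indices
    dist_info = policy.distribution_info_sym(obs_ph)
    onehot = tf.one_hot(actions_ph, policy.action_dim)
    log_prob = tf.log(tf.reduce_sum(dist_info['prob'] * onehot, axis=-1) + 1e-8)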