示例#1
0
    def build_graph(self):
        """
        Assemble the CNN -> MLP graph and record its trainable variables.

        Under ``tf.variable_scope(self.name, reuse=tf.AUTO_REUSE)`` this
        calls ``create_cnn`` and feeds its output tensor into ``create_mlp``.
        The resulting tensors are stored on ``self.input_var``,
        ``self.cnn_output_var`` and ``self.output_var``. Afterwards
        ``self._params`` is set to an ``OrderedDict`` of the trainable
        variables found under the current name scope, keyed by
        ``remove_scope_from_name(var.name, scope)``.
        """
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            # convolutional front end
            cnn_input, cnn_output = create_cnn(
                name='cnn',
                hidden_nonlinearity=self.hidden_nonlinearity,
                kernel_sizes=self.kernel_sizes,
                strides=self.strides,
                num_filters=self.num_filters,
                input_dim=(None, ) + self.input_dim,
                input_var=self.input_var,
            )
            self.input_var = cnn_input
            self.cnn_output_var = cnn_output

            # fully connected head on top of the CNN output; the first
            # returned value is not needed here
            _unused_input, mlp_output = create_mlp(
                name='mlp',
                output_dim=self.output_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_var=self.cnn_output_var,
                batch_normalization=self.batch_normalization,
            )
            self.output_var = mlp_output

        # NOTE(review): the name scope is read after the variable_scope block
        # has exited, so it is the enclosing scope rather than one opened for
        # self.name -- presumably intentional; confirm against callers.
        scope_name = tf.get_default_graph().get_name_scope()
        collected = OrderedDict()
        for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope=scope_name):
            key = remove_scope_from_name(variable.name, scope_name)
            collected[key] = variable
        self._params = collected
示例#2
0
    def build_graph(self):
        """
        Assemble the recurrent graph and record its trainable variables.

        Under ``tf.variable_scope(self.name, reuse=tf.AUTO_REUSE)`` this
        calls ``create_rnn`` and unpacks its five return values into
        ``self.input_var``, ``self.state_var``, ``self.output_var``,
        ``self.next_state_var`` and ``self.cell``. Afterwards
        ``self._params`` is set to an ``OrderedDict`` of the trainable
        variables found under the current name scope, keyed by
        ``remove_scope_from_name(var.name, scope)``.
        """
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            rnn_input_dim = (None, None, self.input_dim)
            (self.input_var,
             self.state_var,
             self.output_var,
             self.next_state_var,
             self.cell) = create_rnn(
                name='rnn',
                cell_type=self._cell_type,
                output_dim=self.output_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=rnn_input_dim,
                input_var=self.input_var,
                state_var=self.state_var,
            )

        # NOTE(review): the name scope is read after the variable_scope block
        # has exited, so it is the enclosing scope rather than one opened for
        # self.name -- presumably intentional; confirm against callers.
        scope_name = tf.get_default_graph().get_name_scope()
        collected = OrderedDict()
        for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope=scope_name):
            key = remove_scope_from_name(variable.name, scope_name)
            collected[key] = variable
        self._params = collected
示例#3
0
    def build_graph(self):
        """
        Assemble the MLP graph and record its trainable variables.

        Calls ``create_mlp`` and stores the two returned tensors on
        ``self.input_var`` and ``self.output_var``. ``self._params`` is then
        set to an ``OrderedDict`` of the trainable variables whose names
        match ``self.name``, keyed by
        ``remove_scope_from_name(var.name, self.name)``.
        """
        # NOTE(review): this method does not open a tf.variable_scope itself;
        # the lookup below filters by self.name, so it presumably relies on
        # the variables already being created under that prefix -- confirm.
        mlp_input_dim = (None, self.input_dim)
        mlp_input, mlp_output = create_mlp(
            name='mlp',
            output_dim=self.output_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=mlp_input_dim,
            input_var=self.input_var,
            batch_normalization=self.batch_normalization,
        )
        self.input_var = mlp_input
        self.output_var = mlp_output

        # collect the trainable variables registered under this object's name
        scope_name = self.name
        collected = OrderedDict()
        for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope=scope_name):
            key = remove_scope_from_name(variable.name, scope_name)
            collected[key] = variable
        self._params = collected
示例#4
0
 def _create_placeholders_for_vars(self, scope, graph_keys=tf.GraphKeys.TRAINABLE_VARIABLES):
     """
     Create one float32 placeholder per variable found under ``scope``.

     Args:
         scope: scope string passed to ``tf.get_collection``; its first
             '/'-separated component is the prefix handed to
             ``remove_scope_from_name`` when building each key.
         graph_keys: graph collection to read the variables from.

     Returns:
         OrderedDict mapping each processed variable name to a
         ``tf.placeholder`` with the variable's shape, named
         ``"<name>_ph"``, in collection order.
     """
     placeholders_by_name = OrderedDict()
     for variable in tf.get_collection(graph_keys, scope=scope):
         key = remove_scope_from_name(variable.name, scope.split('/')[0])
         placeholders_by_name[key] = tf.placeholder(
             tf.float32, shape=variable.shape, name="%s_ph" % key)
     return placeholders_by_name
示例#5
0
    def build_graph(self):
        """
        Builds computational graph for policy

        Under ``tf.variable_scope(self.name, reuse=tf.AUTO_REUSE)`` this:

        * calls ``create_mlp`` for the mean network and stores its two
          return values on ``self.obs_var`` and ``self.mean_var``;
        * creates the ``log_std_var`` variable of shape
          ``(1, self.action_dim)``, initialised to ``self.init_log_std`` and
          trainable only if ``self.learn_std``, and stores its value floored
          at ``self.min_log_std`` on ``self.log_std_var``;
        * defines ``self.action_var`` as the mean plus normal noise scaled
          by ``exp(self.log_std_var)``;
        * sets ``self._dist`` to ``DiagonalGaussian(self.action_dim)``;
        * fills ``self.policy_params`` with the trainable variables matching
          ``self.name`` and ``self.policy_params_ph`` /
          ``self.policy_params_keys`` with placeholders for the variables of
          the mean and log-std sub-scopes.
        """
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            # build the actual policy network
            self.obs_var, self.mean_var = create_mlp(
                name='mean_network',
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=(
                    None,
                    self.obs_dim,
                ),
            )

            with tf.variable_scope("log_std_network", reuse=tf.AUTO_REUSE):
                log_std_var = tf.get_variable(
                    name='log_std_var',
                    shape=(
                        1,
                        self.action_dim,
                    ),
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(self.init_log_std),
                    trainable=self.learn_std,
                )

                # floor the log-std so it never drops below min_log_std
                self.log_std_var = tf.maximum(log_std_var,
                                              self.min_log_std,
                                              name='log_std')

            # symbolically define sampled action and distribution
            # Scale the noise by the floored log-std (self.log_std_var), not
            # the raw variable, so sampled actions respect min_log_std.
            self.action_var = self.mean_var + tf.random_normal(
                shape=tf.shape(self.mean_var)) * tf.exp(self.log_std_var)
            self._dist = DiagonalGaussian(self.action_dim)

            # save the policy's trainable variables in dicts
            current_scope = self.name
            trainable_policy_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
            self.policy_params = OrderedDict([
                (remove_scope_from_name(var.name, current_scope), var)
                for var in trainable_policy_vars
            ])

            # placeholders mirroring the mean-network and log-std variables
            self.policy_params_ph = self._create_placeholders_for_vars(
                scope=self.name + "/mean_network")
            log_std_network_phs = self._create_placeholders_for_vars(
                scope=self.name + "/log_std_network")
            self.policy_params_ph.update(log_std_network_phs)
            self.policy_params_keys = self.policy_params_ph.keys()
示例#6
0
    def build_graph(self):
        """
        Assemble the recurrent Gaussian policy graph.

        Under ``tf.variable_scope(self.name)`` this calls ``create_rnn`` for
        the mean network and unpacks its five return values into
        ``self.obs_var``, ``self.hidden_var``, ``self.mean_var``,
        ``self.next_hidden_var`` and ``self.cell``. It then creates the
        ``log_std_var`` variable of shape ``(1, self.action_dim)``, stores
        its value floored at ``self.min_log_std`` on ``self.log_std_var``,
        sets ``self._dist`` to ``DiagonalGaussian(self.action_dim)`` and
        fills ``self.policy_params`` with the trainable variables found
        under the current name scope.
        """
        with tf.variable_scope(self.name):
            # recurrent mean network
            rnn_input_dim = (None, None, self.obs_dim)
            (self.obs_var,
             self.hidden_var,
             self.mean_var,
             self.next_hidden_var,
             self.cell) = create_rnn(
                name='mean_network',
                cell_type=self._cell_type,
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=rnn_input_dim,
            )

            with tf.variable_scope("log_std_network"):
                raw_log_std = tf.get_variable(
                    name='log_std_var',
                    shape=(1, self.action_dim),
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(self.init_log_std),
                    trainable=self.learn_std,
                )

                # floor the log-std at min_log_std
                self.log_std_var = tf.maximum(
                    raw_log_std, self.min_log_std, name='log_std')

            # distribution object for this policy
            self._dist = DiagonalGaussian(self.action_dim)

            # gather the trainable variables created under the active scope
            scope_name = tf.get_default_graph().get_name_scope()
            collected = OrderedDict()
            for variable in tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name):
                key = remove_scope_from_name(variable.name, scope_name)
                collected[key] = variable
            self.policy_params = collected
    def build_graph(self):
        """
        Assemble the Gaussian policy graph with a shared mean/log-std head.

        Under ``tf.variable_scope(self.name, reuse=tf.AUTO_REUSE)`` this
        calls ``create_mlp`` with ``2 * self.action_dim`` outputs, splits the
        output in two along the last axis into ``self.mean_var`` and a
        log-std tensor, and clips the latter to
        ``[LOG_SIG_MIN, LOG_SIG_MAX]`` as ``self.log_std_var``. It defines
        ``self.action_var`` as the mean plus normal noise scaled by
        ``exp(self.log_std_var)``, sets ``self._dist`` to a
        ``DiagonalGaussian`` built with ``squashed=self.squashed``, and fills
        ``self.policy_params``, ``self.policy_params_ph`` and
        ``self.policy_params_keys``.
        """
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            # single network producing both halves of the output
            network_input_dim = (None, self.obs_dim)
            self.obs_var, self.output_var = create_mlp(
                name='network',
                output_dim=2 * self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_dim=network_input_dim,
            )

            # first half is the mean, second half the unclipped log-std
            mean_part, raw_log_std = tf.split(self.output_var, 2, axis=-1)
            self.mean_var = mean_part
            self.log_std_var = tf.clip_by_value(
                raw_log_std, LOG_SIG_MIN, LOG_SIG_MAX, name='log_std')

            # sampled action: mean + noise * std
            noise = tf.random_normal(shape=tf.shape(self.mean_var))
            std = tf.exp(self.log_std_var)
            self.action_var = self.mean_var + noise * std

            self._dist = DiagonalGaussian(self.action_dim,
                                          squashed=self.squashed)

            # gather the trainable variables registered under self.name
            scope_name = self.name
            collected = OrderedDict()
            for variable in tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name):
                key = remove_scope_from_name(variable.name, scope_name)
                collected[key] = variable
            self.policy_params = collected

            # placeholders mirroring the network's variables
            self.policy_params_ph = self._create_placeholders_for_vars(
                scope=self.name + "/network")
            self.policy_params_keys = self.policy_params_ph.keys()