Example #1
    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy3.distribution.log_likelihood_sym(
            self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy4.distribution.log_likelihood_sym(
            self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)
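Every snippet on this page funnels through `tensor_utils.compile_function`, which turns placeholders plus an output tensor into a plain Python callable. For reference, a minimal sketch of what such a helper typically does in graph-mode TensorFlow (an approximation, not necessarily garage's exact implementation):

import tensorflow as tf

def compile_function(inputs, outputs, log_name=None):
    # Bind placeholder `inputs` to `outputs`; calling the result runs the
    # graph in the default session with a positional feed dict.
    def run(*input_vals):
        sess = tf.compat.v1.get_default_session()
        return sess.run(outputs, feed_dict=dict(zip(inputs, input_vals)))
    return run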
Example #2
    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy3.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func(self.obs, [[1, 1]])

        log_prob_sym2 = self.policy4.model.networks[
            'default'].dist.log_likelihood_sym(self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2(self.obs, [[1, 1]])
        assert log_prob1 == log_prob2
Example #3
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   constraint_name='constraint',
                   name=None,
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon),
         of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        params = target.get_params(trainable=True)
        with tf.name_scope(name, 'PenaltyLbfgsOptimizer',
                           [leq_constraint, loss, params]):
            constraint_term, constraint_value = leq_constraint
            penalty_var = tf.compat.v1.placeholder(tf.float32,
                                                   tuple(),
                                                   name='penalty')
            penalized_loss = loss + penalty_var * constraint_term

            self._target = target
            self._max_constraint_val = constraint_value
            self._constraint_name = constraint_name

            def get_opt_output():
                with tf.name_scope('get_opt_output',
                                   values=[params, penalized_loss]):
                    grads = tf.gradients(penalized_loss, params)
                    for idx, (grad, param) in enumerate(zip(grads, params)):
                        if grad is None:
                            grads[idx] = tf.zeros_like(param)
                    flat_grad = tensor_utils.flatten_tensor_variables(grads)
                    return [
                        tf.cast(penalized_loss, tf.float64),
                        tf.cast(flat_grad, tf.float64),
                    ]

            self._opt_fun = LazyDict(
                f_loss=lambda: tensor_utils.compile_function(
                    inputs, loss, log_name='f_loss'),
                f_constraint=lambda: tensor_utils.compile_function(
                    inputs, constraint_term, log_name='f_constraint'),
                f_penalized_loss=lambda: tensor_utils.compile_function(
                    inputs=inputs + [penalty_var],
                    outputs=[penalized_loss, loss, constraint_term],
                    log_name='f_penalized_loss',
                ),
                f_opt=lambda: tensor_utils.compile_function(
                    inputs=inputs + [penalty_var],
                    outputs=get_opt_output(),
                ))
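Several of the optimizers on this page store their compiled functions in a `LazyDict` of zero-argument factories, so each function is only compiled the first time it is looked up. A minimal sketch of that pattern, assuming only the semantics visible at the call sites above:

class LazyDict:
    # Maps keys to zero-argument factories; each value is built once, on
    # first access, then cached.
    def __init__(self, **factories):
        self._factories = factories
        self._cache = {}

    def __getitem__(self, key):
        if key not in self._cache:
            self._cache[key] = self._factories[key]()
        return self._cache[key]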
Example #4
    def test_policy_entropy_sym(self):
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func([self.obs, self.obs])

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym2')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func([self.obs, self.obs])
        assert np.array_equal(entropy1, entropy2)
Example #5
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   name='LbfgsOptimizer',
                   **kwargs):
        """Construct operation graph for the optimizer.

        Args:
            loss (tf.Tensor): Loss objective to minimize.
            target (object): Target object to optimize. The object should
                implement `get_params()` and `get_param_values()`.
            inputs (list[tf.Tensor]): List of input placeholders.
            extra_inputs (list[tf.Tensor]): List of extra input placeholders.
            name (str): Name scope.
            kwargs (dict): Extra unused keyword arguments. Some optimizers
                have extra input, e.g. KL constraint.

        """
        del kwargs
        self._target = target
        params = target.get_params()
        with tf.name_scope(name):

            def get_opt_output():
                """Helper function to construct graph.

                Returns:
                    list[tf.Tensor]: Loss and gradient tensor.

                """
                with tf.name_scope('get_opt_output'):
                    flat_grad = tensor_utils.flatten_tensor_variables(
                        tf.gradients(loss, params))
                    return [
                        tf.cast(loss, tf.float64),
                        tf.cast(flat_grad, tf.float64)
                    ]

            if extra_inputs is None:
                extra_inputs = list()

            self._opt_fun = LazyDict(
                f_loss=lambda: tensor_utils.compile_function(
                    inputs + extra_inputs, loss),
                f_opt=lambda: tensor_utils.compile_function(
                    inputs=inputs + extra_inputs,
                    outputs=get_opt_output(),
                ))
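The float64 casts in `get_opt_output` exist because scipy's L-BFGS driver works in double precision: the compiled `f_opt` hands (loss, gradient) pairs straight to the optimizer. A hedged standalone sketch of that interface, with `f_opt` below as a stand-in rather than the compiled function above:

import numpy as np
import scipy.optimize

def f_opt(flat_params):
    # Stand-in for the compiled f_opt: returns (loss, grad) in float64.
    loss = np.sum((flat_params - 1.0) ** 2)
    grad = 2.0 * (flat_params - 1.0)
    return np.float64(loss), grad.astype(np.float64)

x_opt, f_min, _ = scipy.optimize.fmin_l_bfgs_b(f_opt, x0=np.zeros(3))
print(np.round(x_opt, 3))  # -> [1. 1. 1.]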
Example #6
    def _build_entropy_term(self, i):
        with tf.name_scope("policy_entropy"):
            if self.policy.recurrent:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name="policy_dist_info")
            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name="policy_dist_info_flat")

            policy_entropy_flat = self.policy.distribution.entropy_sym(
                policy_dist_info_flat)
            policy_entropy = tf.reshape(policy_entropy_flat,
                                        [-1, self.max_path_length])

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            policy_entropy = tf.reduce_mean(policy_entropy * i.valid_var)

        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

        return policy_entropy
Example #7
    def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
        :class:`garage.core.paramerized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon),
         of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        with tf.name_scope(self._name,
                           values=[
                               loss,
                               target.get_params(trainable=True), inputs,
                               extra_inputs
                           ]):

            self._target = target

            self._train_op = self._tf_optimizer.minimize(
                loss, var_list=target.get_params(trainable=True))

            # updates = OrderedDict(
            #     [(k, v.astype(k.dtype)) for k, v in updates.iteritems()])

            if extra_inputs is None:
                extra_inputs = list()
            self._input_vars = inputs + extra_inputs
            self._opt_fun = LazyDict(
                f_loss=lambda: tensor_utils.compile_function(
                    inputs + extra_inputs, loss), )
Example #8
    def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
        """Construct operation graph for the optimizer.

        Args:
            loss (tf.Tensor): Loss objective to minimize.
            target (object): Target object to optimize. The object should
                implement `get_params()` and `get_param_values()`.
            inputs (list[tf.Tensor]): List of input placeholders.
            extra_inputs (list[tf.Tensor]): List of extra input placeholders.
            kwargs (dict): Extra unused keyword arguments. Some optimizers
                have extra input, e.g. KL constraint.

        """
        del kwargs
        with tf.name_scope(self._name):
            self._target = target
            tf_optimizer = make_optimizer(self._tf_optimizer,
                                          **self._learning_rate)
            self._train_op = tf_optimizer.minimize(
                loss, var_list=target.get_params())

            if extra_inputs is None:
                extra_inputs = list()
            self._input_vars = inputs + extra_inputs
            self._opt_fun = LazyDict(
                f_loss=lambda: tensor_utils.compile_function(
                    inputs + extra_inputs, loss), )
Example #9
    def _initialize(self):
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)
        ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                          name='ys',
                                          shape=(None, self._output_dim))

        self._old_network = self._old_model.build(input_var)
        (norm_dist, norm_mean, norm_log_std, _, mean, _, self._x_mean,
         self._x_std, self._y_mean,
         self._y_std) = self.build(input_var).outputs
        self._old_model.parameters = self.parameters

        normalized_ys_var = (ys_var - self._y_mean) / self._y_std
        old_normalized_dist = self._old_network.normalized_dist

        mean_kl = tf.reduce_mean(old_normalized_dist.kl_divergence(norm_dist))
        loss = -tf.reduce_mean(norm_dist.log_prob(normalized_ys_var))
        self._f_predict = tensor_utils.compile_function([input_var], mean)
        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[norm_mean, norm_log_std],
        )
        if self._use_trust_region:
            optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
        optimizer_args['inputs'] = [input_var, ys_var]
        with tf.name_scope('update_opt'):
            self._optimizer.update_opt(**optimizer_args)
Example #10
    def test_kl_sym(self):
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func([self.obs, self.obs])

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func([self.obs, self.obs])

        assert np.array_equal(kl1, kl2)
Example #11
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params(trainable=True)

        constraint_grads = tf.gradients(f, xs=params)
        for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
            if grad is None:
                constraint_grads[idx] = tf.zeros_like(param)

        xs = tuple([
            tensor_utils.new_tensor_like(p.name.split(":")[0], p)
            for p in params
        ])

        def Hx_plain():
            Hx_plain_splits = tf.gradients(
                tf.reduce_sum(
                    tf.stack([
                        tf.reduce_sum(g * x)
                        for g, x in zip(constraint_grads, xs)
                    ])), params)
            for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)):
                if Hx is None:
                    Hx_plain_splits[idx] = tf.zeros_like(param)
            return tensor_utils.flatten_tensor_variables(Hx_plain_splits)

        self._opt_fun = LazyDict(
            f_Hx_plain=lambda: tensor_utils.compile_function(
                inputs=inputs + xs,
                outputs=Hx_plain(),
                log_name="f_Hx_plain",
            ), )
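`Hx_plain` relies on the Pearlmutter double-backprop identity Hx = grad((grad f) . x): differentiating the dot product of the gradient with a fixed vector yields the Hessian-vector product without ever materializing H. A small self-contained check of that identity (TF1-style graph mode, matching the snippets here):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

theta = tf.compat.v1.get_variable('theta', initializer=[1.0, 2.0])
x = tf.constant([1.0, 1.0])
f = tf.reduce_sum(theta ** 4)                        # Hessian is diag(12 * theta**2)
(g,) = tf.gradients(f, [theta])                      # first backprop
(hx,) = tf.gradients(tf.reduce_sum(g * x), [theta])  # second backprop: H @ x

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    print(sess.run(hx))  # -> [12. 48.], i.e. 12 * theta**2 * x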
Example #12
File: npo.py Project: seraliilhan/garage
    def _build_entropy_term(self, i):
        """Build policy entropy tensor.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tf.Tensor: Policy entropy.

        """
        pol_dist = self.policy.distribution

        with tf.name_scope('policy_entropy'):
            if self._use_neg_logli_entropy:
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')
            else:
                policy_entropy = pol_dist.entropy()

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        # dense form, match the shape of advantage
        policy_entropy = tf.reshape(policy_entropy, [-1, self.max_path_length])

        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), policy_entropy)

        return policy_entropy
Example #13
    def _initialize(self):
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)

        with tf.compat.v1.variable_scope(self._name) as vs:
            self._variable_scope = vs
            self.model.build(input_var)
            ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                              name='ys',
                                              shape=(None, self._output_dim))

            y_hat = self.model.networks['default'].y_hat
            loss = tf.reduce_mean(tf.square(y_hat - ys_var))

            self._f_predict = tensor_utils.compile_function([input_var], y_hat)
            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[ys_var],
            )

            optimizer_args['inputs'] = [input_var, ys_var]

            with tf.name_scope('update_opt'):
                self._optimizer.update_opt(**optimizer_args)
Example #14
    def _build_embedding_kl(self, i):
        dist = self.policy._embedding._dist
        with tf.name_scope("embedding_kl"):
            # new distribution
            embed_dist_info_flat = self.policy._embedding.dist_info_sym(
                i.flat.task_var,
                i.flat.embed_state_info_vars,
                name="embed_dist_info_flat")
            embed_dist_info_valid = filter_valids_dict(
                embed_dist_info_flat,
                i.flat.valid_var,
                name="embed_dist_info_valid")

            # calculate KL divergence
            kl = dist.kl_sym(i.valid.embed_old_dist_info_vars,
                             embed_dist_info_valid)
            mean_kl = tf.reduce_mean(kl)

            # Diagnostic function
            self.f_embedding_kl = tensor_utils.compile_function(
                flatten_inputs(self._policy_opt_inputs),
                mean_kl,
                log_name="f_embedding_kl")

            return mean_kl
Example #15
    def update_hvp(self, f, target, inputs, reg_coeff, name='PearlmutterHvp'):
        """Build the symbolic graph to compute the Hessian-vector product.

        Args:
            f (tf.Tensor): The function whose Hessian needs to be computed.
            target (garage.tf.policies.Policy): A parameterized object to
                optimize over.
            inputs (tuple[tf.Tensor]): The inputs for function f.
            reg_coeff (float): A small value so that A -> A + reg*I.
            name (str): Name to be used in tf.name_scope.

        """
        self._target = target
        self._reg_coeff = reg_coeff
        params = target.get_params()
        with tf.name_scope(name):
            constraint_grads = tf.gradients(f,
                                            xs=params,
                                            name='gradients_constraint')
            for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
                if grad is None:
                    constraint_grads[idx] = tf.zeros_like(param)

            xs = tuple([
                tensor_utils.new_tensor_like(p.name.split(':')[0], p)
                for p in params
            ])

            def hx_plain():
                """Computes product of Hessian(f) and vector v.

                Returns:
                    tf.Tensor: Symbolic result.

                """
                with tf.name_scope('hx_plain'):
                    with tf.name_scope('hx_function'):
                        hx_f = tf.reduce_sum(
                            tf.stack([
                                tf.reduce_sum(g * x)
                                for g, x in zip(constraint_grads, xs)
                            ]))
                    hx_plain_splits = tf.gradients(hx_f,
                                                   params,
                                                   name='gradients_hx_plain')
                    for idx, (hx,
                              param) in enumerate(zip(hx_plain_splits,
                                                      params)):
                        if hx is None:
                            hx_plain_splits[idx] = tf.zeros_like(param)
                    return tensor_utils.flatten_tensor_variables(
                        hx_plain_splits)

            self._hvp_fun = LazyDict(
                f_hx_plain=lambda: tensor_utils.compile_function(
                    inputs=inputs + xs,
                    outputs=hx_plain(),
                    log_name='f_hx_plain',
                ), )
Example #16
    def _initialize(self):
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)

        with tf.compat.v1.variable_scope(self._variable_scope):
            self.model.build(input_var)

            ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                              name='ys',
                                              shape=(None, self._output_dim))

            old_prob_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                                    name='old_prob',
                                                    shape=(None,
                                                           self._output_dim))

            y_hat = self.model.networks['default'].y_hat

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=y_hat)

            self._dist = Categorical(self._output_dim)
            mean_kl = tf.reduce_mean(
                self._dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(
                self._dist.log_likelihood_sym(ys_var, info_vars))

            predicted = tf.one_hot(tf.argmax(y_hat, axis=1),
                                   depth=self._output_dim)

            self._f_predict = tensor_utils.compile_function([input_var],
                                                            predicted)
            self._f_prob = tensor_utils.compile_function([input_var], y_hat)

            self._optimizer.update_opt(loss=loss,
                                       target=self,
                                       network_output=[y_hat],
                                       inputs=[input_var, ys_var])
            self._tr_optimizer.update_opt(
                loss=loss,
                target=self,
                network_output=[y_hat],
                inputs=[input_var, ys_var, old_prob_var],
                leq_constraint=(mean_kl, self._max_kl_step))
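Note the distinction above: `_f_prob` returns the raw class probabilities, while `_f_predict` one-hot encodes the argmax class. A tiny numpy illustration with hypothetical values:

import numpy as np

probs = np.array([[0.1, 0.7, 0.2]])        # what _f_prob would return
one_hot = np.eye(3)[probs.argmax(axis=1)]  # what _f_predict would return
print(one_hot)  # -> [[0. 1. 0.]]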
Example #17
    def _build_net(self, reuse=None, custom_getter=None, trainable=None):
        """
        Set up the Q network based on class attributes. This function uses
        layers defined in rllab.tf.

        Args:
            reuse: A bool indicating whether to reuse variables in the same
                scope.
            custom_getter: A customized getter object used to get variables.
            trainable: A bool indicating whether the variables are trainable.
        """
        with tf.variable_scope(self.name,
                               reuse=reuse,
                               custom_getter=custom_getter):
            l_obs = L.InputLayer(shape=(None, self._obs_dim), name="obs")
            l_action = L.InputLayer(shape=(None, self._action_dim),
                                    name="actions")

            n_layers = len(self._hidden_sizes) + 1

            if n_layers > 1:
                action_merge_layer = \
                    (self._action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(l_hidden,
                                        num_units=size,
                                        nonlinearity=self._hidden_nonlinearity,
                                        trainable=trainable,
                                        name="hidden_%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(l_hidden,
                                    num_units=1,
                                    nonlinearity=self._output_nonlinearity,
                                    trainable=trainable,
                                    name="output")

            output_var = L.get_output(l_output)

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action

        LayersPowered.__init__(self, [l_output])
Example #18
    def __init__(
        self,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
        name="CategoricalConvPolicy",
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected
        hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
         network params
        are ignored
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        Serializable.quick_init(self, locals())

        self._name = name
        self._env_spec = env_spec
        self._prob_network_name = "prob_network"

        with tf.variable_scope(name, "CategoricalConvPolicy"):
            if prob_network is None:
                prob_network = ConvNetwork(
                    input_shape=env_spec.observation_space.shape,
                    output_dim=env_spec.action_space.n,
                    conv_filters=conv_filters,
                    conv_filter_sizes=conv_filter_sizes,
                    conv_strides=conv_strides,
                    conv_pads=conv_pads,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    name="conv_prob_network",
                )

            with tf.name_scope(self._prob_network_name):
                out_prob = L.get_output(prob_network.output_layer)

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], [out_prob])

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalConvPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
Example #19
    def build_net(self, trainable=True, name=None):
        """
        Set up the Q network based on class attributes. This function uses
        layers defined in garage.tf.

        Args:
            trainable: A bool indicating whether the variables are trainable.
            name: Name of the variable scope.
        """
        with tf.variable_scope(name):
            l_obs = L.InputLayer(shape=(None, self._obs_dim), name="obs")
            l_action = L.InputLayer(shape=(None, self._action_dim),
                                    name="actions")

            n_layers = len(self._hidden_sizes) + 1

            if n_layers > 1:
                action_merge_layer = \
                    (self._action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(l_hidden,
                                        num_units=size,
                                        nonlinearity=self._hidden_nonlinearity,
                                        trainable=trainable,
                                        name="hidden_%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(l_hidden,
                                    num_units=1,
                                    nonlinearity=self._output_nonlinearity,
                                    trainable=trainable,
                                    name="output")

            output_var = L.get_output(l_output)

        f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        output_layer = l_output
        obs_layer = l_obs
        action_layer = l_action

        return f_qval, output_layer, obs_layer, action_layer
Example #20
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   name=None,
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
        :class:`garage.core.paramerized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon),
         of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        self._target = target
        params = target.get_params(trainable=True)
        with tf.name_scope(name, "LbfgsOptimizer",
                           [loss, inputs, params, extra_inputs]):

            def get_opt_output():
                with tf.name_scope("get_opt_output", [loss, params]):
                    flat_grad = tensor_utils.flatten_tensor_variables(
                        tf.gradients(loss, params))
                    return [
                        tf.cast(loss, tf.float64),
                        tf.cast(flat_grad, tf.float64)
                    ]

            if extra_inputs is None:
                extra_inputs = list()

            self._opt_fun = ext.lazydict(
                f_loss=lambda: tensor_utils.compile_function(
                    inputs + extra_inputs, loss),
                f_opt=lambda: tensor_utils.compile_function(
                    inputs=inputs + extra_inputs,
                    outputs=get_opt_output(),
                ))
Example #21
    def _build_entropy_term(self, i):
        with tf.name_scope("policy_entropy"):
            if self.policy.recurrent:
                policy_dist_info = self.policy.dist_info_sym(
                    i.obs_var,
                    i.policy_state_info_vars,
                    name="policy_dist_info")

                policy_neg_log_likeli = self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.action_var,
                    policy_dist_info,
                    name="policy_log_likeli")

                if self._use_neg_logli_entropy:
                    policy_entropy = policy_neg_log_likeli
                else:
                    policy_entropy = self.policy.distribution.entropy_sym(
                        policy_dist_info)

            else:
                policy_dist_info_flat = self.policy.dist_info_sym(
                    i.flat.obs_var,
                    i.flat.policy_state_info_vars,
                    name="policy_dist_info_flat_entropy")

                policy_dist_info_valid = filter_valids_dict(
                    policy_dist_info_flat,
                    i.flat.valid_var,
                    name="policy_dist_info_valid")

                policy_neg_log_likeli_valid = self.policy.distribution.log_likelihood_sym(  # noqa: E501
                    i.valid.action_var,
                    policy_dist_info_valid,
                    name="policy_log_likeli")

                if self._use_neg_logli_entropy:
                    policy_entropy = policy_neg_log_likeli_valid
                else:
                    policy_entropy = self.policy.distribution.entropy_sym(
                        policy_dist_info_valid)

            # This prevents entropy from becoming negative for small policy std
            if self._use_softplus_entropy:
                policy_entropy = tf.nn.softplus(policy_entropy)

            if self._stop_entropy_gradient:
                policy_entropy = tf.stop_gradient(policy_entropy)

        self.f_policy_entropy = tensor_utils.compile_function(
            flatten_inputs(self._policy_opt_inputs),
            policy_entropy,
            log_name="f_policy_entropy")

        return policy_entropy
Example #22
    def __init__(self, dim, name=None):
        with tf.variable_scope(name, "Categorical"):
            self._dim = dim
            self._name = name
            weights_var = tf.placeholder(
                dtype=tf.float32, shape=(None, dim), name="weights")
            self._f_sample = compile_function(
                inputs=[weights_var],
                outputs=tf.multinomial(
                    tf.log(weights_var + 1e-8), num_samples=1)[:, 0],
            )
Example #23
    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func([[1, 1]], self.obs)

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func([[1, 1]], self.obs)

        assert likelihood_ratio1 == likelihood_ratio2
Example #24
    def __init__(self, dim, name=None):
        with tf.compat.v1.variable_scope(name, 'Categorical'):
            self._dim = dim
            self._name = name
            weights_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                                   shape=(None, dim),
                                                   name='weights')
            self._f_sample = compile_function(
                inputs=[weights_var],
                outputs=tf.random.categorical(tf.math.log(weights_var + 1e-8),
                                              num_samples=1)[:, 0],
            )
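The `tf.math.log(weights + 1e-8)` above is there because `tf.random.categorical` expects unnormalized log-probabilities, and the 1e-8 guards against log(0) for zero-weight classes. A minimal standalone version of the same sampling op (graph mode assumed):

import numpy as np
import tensorflow as tf

tf.compat.v1.disable_eager_execution()

weights = tf.compat.v1.placeholder(tf.float32, shape=(None, 3), name='weights')
sample = tf.random.categorical(tf.math.log(weights + 1e-8), num_samples=1)[:, 0]

with tf.compat.v1.Session() as sess:
    probs = np.array([[0.1, 0.8, 0.1]], dtype=np.float32)
    print(sess.run(sample, feed_dict={weights: probs}))  # most often [1]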
Example #25
    def __init__(
        self,
        env_spec,
        name='CategoricalMLPPolicy',
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
    ):
        """
        CategoricalMLPPolicy.

        A policy that uses an MLP to estimate a categorical distribution.

        Args:
            env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
            hidden_sizes (list[int]): Output dimension of dense layer(s).
                For example, (32, 32) means the MLP of this policy consists
                of two hidden layers, each with 32 hidden units.
            hidden_nonlinearity: Activation function for
                intermediate dense layer(s).
            prob_network (tf.Tensor): Manually specified network for this
                policy. If None, an MLP with the given network parameters will
                be created. If not None, other network params are ignored.
        """
        assert isinstance(env_spec.action_space, akro.Discrete)

        Serializable.quick_init(self, locals())

        self.name = name
        self._prob_network_name = 'prob_network'
        with tf.variable_scope(name, 'CategoricalMLPPolicy'):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim, ),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name=self._prob_network_name,
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            with tf.name_scope(self._prob_network_name):
                prob_network_outputs = L.get_output(prob_network.output_layer)
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], prob_network_outputs)

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalMLPPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
Example #26
    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func(
            np.ones((2, 1, 1)), [self.obs, self.obs])

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func(
            np.ones((2, 1, 1)), [self.obs, self.obs])

        assert np.array_equal(likelihood_ratio1, likelihood_ratio2)
Example #27
    def update_opt(self,
                   target,
                   leq_constraint,
                   inputs,
                   extra_inputs=None,
                   constraint_name="constraint",
                   *args,
                   **kwargs):
        """Update the internal tensowflow operations.

        Parameters
        ----------
        target :
            A parameterized object to optimize over. It should implement methods of the
            :py:class:`garage.core.paramerized.Parameterized` class.
        leq_constraint : :py:class:'tensorflow.Tensor'
            The variable to be constrained.
        inputs :
            A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
            that the first dimension of these inputs should correspond to the number of data points.
        extra_inputs :
            A list of symbolic variables as extra inputs which should not be subsampled.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        # constraint_term, constraint_value = leq_constraint
        constraint_term = leq_constraint

        # params = target.get_params(trainable=True)

        self._hvp_approach.update_hvp(f=constraint_term,
                                      target=target,
                                      inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        # self._max_constraint_val = constraint_value
        self._max_constraint_val = np.inf
        self._constraint_name = constraint_name

        self._opt_fun = LazyDict(
            f_constraint=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
            ), )
Example #28
    def _initialize(self):
        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) +
                                             self._input_shape)
        self._old_model.build(input_var)
        self._old_model.parameters = self.model.parameters

        with tf.compat.v1.variable_scope(self._variable_scope):
            self.model.build(input_var)

            ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                              name='ys',
                                              shape=(None, self._output_dim))

            y_mean_var = self.model.networks['default'].y_mean
            y_std_var = self.model.networks['default'].y_std
            means_var = self.model.networks['default'].mean

            normalized_means_var = self.model.networks[
                'default'].normalized_mean
            normalized_log_stds_var = self.model.networks[
                'default'].normalized_log_std

            normalized_ys_var = (ys_var - y_mean_var) / y_std_var

            old_normalized_dist = self._old_model.networks[
                'default'].normalized_dist
            normalized_dist = self.model.networks['default'].normalized_dist

            mean_kl = tf.reduce_mean(
                old_normalized_dist.kl_divergence(normalized_dist))

            loss = -tf.reduce_mean(normalized_dist.log_prob(normalized_ys_var))

            self._f_predict = tensor_utils.compile_function([input_var],
                                                            means_var)

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[
                    normalized_means_var, normalized_log_stds_var
                ],
            )

            if self._use_trust_region:
                optimizer_args['leq_constraint'] = (mean_kl, self._max_kl_step)
            optimizer_args['inputs'] = [input_var, ys_var]

            with tf.name_scope('update_opt'):
                self._optimizer.update_opt(**optimizer_args)
Example #29
    def update_opt(self, f, target, inputs, reg_coeff, name=None):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params(trainable=True)

        with tf.name_scope(name, "FiniteDifferenceHvp",
                           [f, inputs, params, target]):
            constraint_grads = tf.gradients(f,
                                            xs=params,
                                            name="gradients_constraint")
            for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
                if grad is None:
                    constraint_grads[idx] = tf.zeros_like(param)

            flat_grad = tensor_utils.flatten_tensor_variables(constraint_grads)

            def f_hx_plain(*args):
                with tf.name_scope("f_hx_plain", values=[inputs, self.target]):
                    inputs_ = args[:len(inputs)]
                    xs = args[len(inputs):]
                    flat_xs = np.concatenate(
                        [np.reshape(x, (-1, )) for x in xs])
                    param_val = self.target.get_param_values(trainable=True)
                    eps = np.cast['float32'](
                        self.base_eps / (np.linalg.norm(param_val) + 1e-8))
                    self.target.set_param_values(param_val + eps * flat_xs,
                                                 trainable=True)
                    flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
                    self.target.set_param_values(param_val, trainable=True)
                    if self.symmetric:
                        self.target.set_param_values(param_val - eps * flat_xs,
                                                     trainable=True)
                        flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
                        hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
                        self.target.set_param_values(param_val, trainable=True)
                    else:
                        flat_grad = self.opt_fun["f_grad"](*inputs_)
                        hx = (flat_grad_dvplus - flat_grad) / eps
                    return hx

            self.opt_fun = ext.LazyDict(
                f_grad=lambda: tensor_utils.compile_function(
                    inputs=inputs,
                    outputs=flat_grad,
                    log_name="f_grad",
                ),
                f_hx_plain=lambda: f_hx_plain,
            )
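`f_hx_plain` above approximates the Hessian-vector product by differencing gradients rather than running a second backprop pass; with `symmetric` enabled it uses the central difference Hx ~ (grad(theta + eps*x) - grad(theta - eps*x)) / (2*eps). A hedged numpy sketch of just that arithmetic, with `grad_fn` standing in for the compiled `f_grad`:

import numpy as np

def fd_hvp(grad_fn, theta, x, base_eps=1e-5):
    # eps shrinks with the parameter norm, mirroring f_hx_plain above.
    eps = base_eps / (np.linalg.norm(theta) + 1e-8)
    return (grad_fn(theta + eps * x) - grad_fn(theta - eps * x)) / (2 * eps)

# Check on a quadratic f(theta) = 0.5 * theta @ A @ theta, where H @ x = A @ x.
A = np.array([[2.0, 1.0], [1.0, 3.0]])
theta = np.array([1.0, -1.0])
x = np.array([0.5, 2.0])
print(np.allclose(fd_hvp(lambda t: A @ t, theta, x), A @ x, atol=1e-4))  # True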
Example #30
    def _build_net(self, reuse=None, custom_getter=None, trainable=None):
        """
        Set up the Q network based on class attributes.

        This function uses layers defined in garage.tf.

        Args:
            reuse: A bool indicating whether to reuse variables in the same
                scope.
            custom_getter: A customized getter object used to get variables.
            trainable: A bool indicating whether the variables are trainable.
        """
        with tf.variable_scope(self.name,
                               reuse=reuse,
                               custom_getter=custom_getter):
            l_in = layers.InputLayer(shape=(None, self._obs_dim), name="obs")

            l_hidden = l_in
            for idx, hidden_size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                l_hidden = layers.DenseLayer(
                    l_hidden,
                    hidden_size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name="hidden_%d" % idx)

            l_output = layers.DenseLayer(
                l_hidden,
                self._action_dim,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name="output")

            with tf.name_scope(self._policy_network_name):
                action = layers.get_output(l_output)
                scaled_action = tf.multiply(action,
                                            self._action_bound,
                                            name="scaled_action")

        self._f_prob_online = tensor_utils.compile_function(
            inputs=[l_in.input_var], outputs=scaled_action)
        self._output_layer = l_output
        self._obs_layer = l_in

        LayersPowered.__init__(self, [l_output])