Example #1
    def __init__(self, _p, inputs, s, costs, h=None, ha=None):
        '''Constructs and compiles the necessary Theano functions.

        _p : list of Theano shared variables
            Parameters of the model to be optimized.
        inputs : list of Theano variables
            Symbolic variables that are inputs to your graph (they should also
            include your model 'output'). Your training examples must fit
            these.
        s : Theano variable
            Symbolic variable with respect to which the Hessian of the
            objective is positive-definite, implicitly defining the
            Gauss-Newton matrix.
            Typically, it is the activation of the output layer.
        costs : list of Theano variables
            Monitoring costs, the first of which will be the optimized
            objective.
        h : Theano variable or None
            Structural damping is applied to this variable (typically the
            hidden units of an RNN).
        ha : Theano variable or None
            Symbolic variable that implicitly defines the Gauss-Newton matrix
            for the structural damping term (typically the activation of the
            hidden layer). If None, it will be set to `h`.'''

        self.p = _p
        self.shapes = [i.get_value().shape for i in _p]
        self.sizes = list(map(numpy.prod, self.shapes))
        self.positions = numpy.cumsum([0] + self.sizes)[:-1]

        g = T.grad(costs[0], _p)
        g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
        self.f_gc = compile_function(inputs,
                                     g + costs)  # during gradient computation
        self.f_cost = compile_function(inputs,
                                       costs)  # for quick cost evaluation

        symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

        v = [symbolic_types[len(i)]() for i in self.shapes]
        gv = gauss_newton_product(costs[0], _p, v, s)

        coefficient = T.scalar()  # this is lambda*mu
        if h is not None:  # structural damping with cross-entropy
            # T.Rop doesn't support `consider_constant` yet, so use `givens`
            h_constant = symbolic_types[h.ndim]()
            structural_damping = coefficient * (
                -h_constant * T.log(h + 1e-10) -
                (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
            if ha is None:
                ha = h
            gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
            gv = [a + b for a, b in zip(gv, gv_damping)]
            givens = {h_constant: h}
        else:
            givens = {}

        self.function_Gv = compile_function(
            inputs + v + [coefficient], gv, givens=givens)
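In Hessian-free training, the compiled `function_Gv` is consumed by a conjugate-gradient solver that only ever needs matrix-vector products with the (damped) Gauss-Newton matrix. Below is a minimal, self-contained NumPy sketch of that CG loop; the toy SPD matrix `A` and vector `b` stand in for the batched Theano product and the gradient returned by `f_gc`.

import numpy as np

def conjugate_gradient(Gv, b, iters=50, tol=1e-10):
    """Approximately solve G x = b using only G-vector products."""
    x = np.zeros_like(b)
    r = b - Gv(x)
    p = r.copy()
    rr = r @ r
    for _ in range(iters):
        Gp = Gv(p)
        alpha = rr / (p @ Gp)
        x = x + alpha * p
        r = r - alpha * Gp
        rr_new = r @ r
        if rr_new < tol:
            break
        p = r + (rr_new / rr) * p
        rr = rr_new
    return x

A = np.array([[3.0, 1.0], [1.0, 2.0]])  # SPD stand-in for the damped Gauss-Newton matrix
b = np.array([1.0, -1.0])               # stand-in for the gradient from f_gc
x = conjugate_gradient(lambda v: A @ v, b)
print(np.allclose(A @ x, b))            # True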
Example #2
    def init_opt(self):

        # First, create "target" policy and Q functions
        target_policy = pickle.loads(pickle.dumps(self.policy))
        target_qf = pickle.loads(pickle.dumps(self.qf))

        # The y targets need to be computed first
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        yvar = TT.vector('ys')

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * sum(
            TT.sum(TT.square(param))
            for param in self.qf.get_params(regularizable=True))

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = TT.mean(TT.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum([
            TT.sum(TT.square(param))
            for param in self.policy.get_params(regularizable=True)
        ])
        policy_qval = self.qf.get_qval_sym(obs,
                                           self.policy.get_action_sym(obs),
                                           deterministic=True)
        policy_surr = -TT.mean(policy_qval)

        policy_reg_surr = policy_surr + policy_weight_decay_term

        qf_updates = self.qf_update_method(qf_reg_loss,
                                           self.qf.get_params(trainable=True))
        policy_updates = self.policy_update_method(
            policy_reg_surr, self.policy.get_params(trainable=True))

        f_train_qf = tensor_utils.compile_function(inputs=[yvar, obs, action],
                                                   outputs=[qf_loss, qval],
                                                   updates=qf_updates)

        f_train_policy = tensor_utils.compile_function(inputs=[obs],
                                                       outputs=policy_surr,
                                                       updates=policy_updates)

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
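The `ys` values fed to `f_train_qf` are computed outside the compiled functions. In DDPG they are the bootstrapped targets y_i = r_i + discount * (1 - terminal_i) * Q'(s'_i, mu'(s'_i)) evaluated with the target networks; a runnable NumPy sketch with toy stand-in arrays:

import numpy as np

rewards = np.array([1.0, 0.0, -1.0])
terminals = np.array([0.0, 0.0, 1.0])
next_qvals = np.array([2.0, 3.0, 5.0])  # stand-in for target_qf(s', target_policy(s'))
discount = 0.99

ys = rewards + discount * (1.0 - terminals) * next_qvals
print(ys)  # [ 2.98  2.97 -1.  ]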
Example #3
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   constraint_name="constraint",
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.paramerized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon),
         of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        constraint_term, constraint_value = leq_constraint
        penalty_var = TT.scalar("penalty")
        penalized_loss = loss + penalty_var * constraint_term

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        def get_opt_output():
            flat_grad = flatten_tensor_variables(
                theano.grad(
                    penalized_loss,
                    target.get_params(trainable=True),
                    disconnected_inputs='ignore'))
            return [
                penalized_loss.astype('float64'),
                flat_grad.astype('float64')
            ]

        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
            f_constraint=lambda: compile_function(
                inputs, constraint_term, log_name="f_constraint"),
            f_penalized_loss=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=[penalized_loss, loss, constraint_term],
                log_name="f_penalized_loss",
            ),
            f_opt=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=get_opt_output(),
                log_name="f_opt"))
Example #4
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   network_outputs,
                   extra_inputs=None):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.paramerized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :param network_outputs: symbolic network outputs with respect to which
         the Gauss-Newton matrix of the loss is implicitly defined (passed to
         the Hessian-free optimizer as `s`)
        :param extra_inputs: a list of extra symbolic variables appended to
         `inputs`
        :return: No return value.
        """

        self._target = target

        if extra_inputs is None:
            extra_inputs = list()

        self._hf_optimizer = HfOptimizer(
            _p=target.get_params(trainable=True),
            inputs=(inputs + extra_inputs),
            s=network_outputs,
            costs=[loss],
        )

        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss), )
Example #5
    def test_gru_network(self):
        network = GRUNetwork(
            input_shape=(2, 3),
            output_dim=5,
            hidden_dim=4,
        )
        f_output = tensor_utils.compile_function(
            inputs=[network.input_layer.input_var],
            outputs=L.get_output(network.output_layer))
        assert f_output(np.zeros((6, 8, 2, 3))).shape == (6, 8, 5)
Example #6
    def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=NL.rectify,
        output_nonlinearity=NL.softmax,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected
        hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden
        layer
        :param prob_network: manually specified network for this
        policy, other network params are ignored
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        Serializable.quick_init(self, locals())

        self._env_spec = env_spec

        if prob_network is None:
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
                name="prob_network",
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #7
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   gradients=None,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.paramerized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :param gradients: symbolic expressions for the gradients of trainable
         parameters of the target. By default this will be computed by calling
         theano.grad
        :return: No return value.
        """

        self._target = target

        if gradients is None:
            gradients = theano.grad(loss,
                                    target.get_params(trainable=True),
                                    disconnected_inputs='ignore')
        updates = self._update_method(gradients,
                                      target.get_params(trainable=True))
        updates = OrderedDict([(k, v.astype(k.dtype))
                               for k, v in updates.items()])

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = ext.LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs + extra_inputs, loss),
            f_opt=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                updates=updates,
            ))
Example #8
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   gradients=None,
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.paramerized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :param gradients: symbolic expressions for the gradients of trainable
         parameters of the target. By default this will be computed by calling
         theano.grad
        :return: No return value.
        """

        self._target = target

        def get_opt_output(gradients):
            if gradients is None:
                gradients = theano.grad(
                    loss, target.get_params(trainable=True))
            flat_grad = flatten_tensor_variables(gradients)
            return [loss.astype('float64'), flat_grad.astype('float64')]

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = LazyDict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(gradients),
            ))
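`flatten_tensor_variables` concatenates the raveled per-parameter gradients into the single float64 vector that `f_opt` returns next to the loss, which is the shape L-BFGS-style routines expect. The NumPy equivalent:

import numpy as np

grads = [np.ones((3, 2)), np.zeros(4)]  # per-parameter gradient arrays
flat_grad = np.concatenate([g.reshape(-1) for g in grads]).astype('float64')
print(flat_grad.shape)  # (10,)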
Example #9
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=NL.rectify,
                 hidden_w_init=LI.HeUniform(),
                 hidden_b_init=LI.Constant(0.),
                 output_nonlinearity=NL.tanh,
                 output_w_init=LI.Uniform(-3e-3, 3e-3),
                 output_b_init=LI.Uniform(-3e-3, 3e-3),
                 bn=False):

        assert isinstance(env_spec.action_space, Box)

        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))

        l_hidden = l_obs
        if bn:
            l_hidden = batch_norm(l_hidden)

        for idx, size in enumerate(hidden_sizes):
            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                W=hidden_w_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % idx)
            if bn:
                l_hidden = batch_norm(l_hidden)

        l_output = L.DenseLayer(
            l_hidden,
            num_units=env_spec.action_space.flat_dim,
            W=output_w_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output")

        # Note the deterministic=True argument. It makes sure that when getting
        # actions from single observations, we do not update params in the
        # batch normalization layers

        action_var = L.get_output(l_output, deterministic=True)
        self._output_layer = l_output

        self._f_actions = tensor_utils.compile_function([l_obs.input_var],
                                                        action_var)

        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [l_output])
Example #10
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff

        params = target.get_params(trainable=True)

        constraint_grads = theano.grad(f,
                                       wrt=params,
                                       disconnected_inputs='warn')
        flat_grad = tensor_utils.flatten_tensor_variables(constraint_grads)

        def f_hx_plain(*args):
            inputs_ = args[:len(inputs)]
            xs = args[len(inputs):]
            flat_xs = np.concatenate([np.reshape(x, (-1, )) for x in xs])
            param_val = self.target.get_param_values(trainable=True)
            eps = np.cast['float32'](self.base_eps /
                                     (np.linalg.norm(param_val) + 1e-8))
            self.target.set_param_values(param_val + eps * flat_xs,
                                         trainable=True)
            flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
            if self.symmetric:
                self.target.set_param_values(param_val - eps * flat_xs,
                                             trainable=True)
                flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
                hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
                self.target.set_param_values(param_val, trainable=True)
            else:
                self.target.set_param_values(param_val, trainable=True)
                flat_grad = self.opt_fun["f_grad"](*inputs_)
                hx = (flat_grad_dvplus - flat_grad) / eps
            return hx

        self.opt_fun = ext.LazyDict(
            f_grad=lambda: tensor_utils.compile_function(
                inputs=inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_hx_plain=lambda: f_hx_plain,
        )
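`f_hx_plain` approximates a Hessian-vector product by finite differences of the flat gradient. For a quadratic f(p) = 0.5 * p @ A @ p the gradient is A @ p, so both the one-sided and the symmetric difference recover A @ x up to rounding; a runnable NumPy check (1e-5 stands in for `self.base_eps`):

import numpy as np

A = np.array([[2.0, 0.5], [0.5, 1.0]])
grad = lambda p: A @ p                    # gradient of f(p) = 0.5 * p @ A @ p

p0 = np.array([1.0, -2.0])
x = np.array([0.3, 0.7])
eps = 1e-5 / (np.linalg.norm(p0) + 1e-8)  # the same scale-aware step as above

hx_one_sided = (grad(p0 + eps * x) - grad(p0)) / eps
hx_symmetric = (grad(p0 + eps * x) - grad(p0 - eps * x)) / (2 * eps)
print(np.allclose(hx_one_sided, A @ x), np.allclose(hx_symmetric, A @ x))  # True True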
Example #11
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: sizes list for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy;
         other network params are ignored
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        Serializable.quick_init(self, locals())

        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim *
                             num_seq_inputs, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = tensor_utils.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #12
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params(trainable=True)

        constraint_grads = theano.grad(f,
                                       wrt=params,
                                       disconnected_inputs='warn')
        xs = tuple([
            tensor_utils.new_tensor_like("%s x" % p.name, p) for p in params
        ])

        def hx_plain():
            hx_plain_splits = TT.grad(TT.sum(
                [TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
                                      wrt=params,
                                      disconnected_inputs='warn')
            return TT.concatenate([TT.flatten(s) for s in hx_plain_splits])

        self.opt_fun = ext.LazyDict(
            f_hx_plain=lambda: tensor_utils.compile_function(
                inputs=inputs + xs,
                outputs=hx_plain(),
                log_name="f_hx_plain",
            ), )
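`hx_plain` relies on the double-gradient identity grad_p(grad_p f . x) = H x (Pearlmutter's trick), so the Hessian-vector product is built without ever forming H. A small sympy check of the identity:

import sympy as sp

p1, p2, x1, x2 = sp.symbols('p1 p2 x1 x2')
f = p1**2 * p2 + sp.sin(p2)

g = [sp.diff(f, v) for v in (p1, p2)]               # constraint_grads
gx = g[0] * x1 + g[1] * x2                          # the TT.sum(g * x) term
hx = sp.Matrix([sp.diff(gx, v) for v in (p1, p2)])  # gradient of the inner product
H = sp.hessian(f, (p1, p2))
print(hx - H * sp.Matrix([x1, x2]))                 # Matrix([[0], [0]])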
Example #13
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   extra_inputs=None,
                   constraint_name="constraint",
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should
         implement methods of the
         :class:`garage.core.paramerized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon),
         of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be
         subsampled if needed. It is assumed that the first dimension of these
         inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which
         should not be subsampled
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
        flat_grad = tensor_utils.flatten_tensor_variables(grads)

        self._hvp_approach.update_opt(f=constraint_term,
                                      target=target,
                                      inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        self._opt_fun = ext.LazyDict(
            f_loss=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name="f_loss",
            ),
            f_grad=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_constraint=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
            ),
            f_loss_constraint=lambda: tensor_utils.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name="f_loss_constraint",
            ),
        )
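The split between `inputs` and `extra_inputs` in the docstring is a subsampling contract: every entry of `inputs` shares a leading data dimension and is subsampled with the same row indices, while `extra_inputs` are always passed whole. In NumPy terms:

import numpy as np

obs = np.random.randn(100, 4)
actions = np.random.randn(100, 2)
inputs = (obs, actions)                         # leading dim = number of data points

idx = np.random.choice(100, size=25, replace=False)
subsampled = tuple(arr[idx] for arr in inputs)  # the same rows from every input
print([a.shape for a in subsampled])            # [(25, 4), (25, 2)]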
Example #14
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=NL.rectify,
                 hidden_w_init=lasagne.init.HeUniform(),
                 hidden_b_init=lasagne.init.Constant(0.),
                 action_merge_layer=-2,
                 output_nonlinearity=None,
                 output_w_init=lasagne.init.Uniform(-3e-3, 3e-3),
                 output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
                 bn=False):
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim),
                             name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim),
                                name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(l_hidden,
                                    num_units=size,
                                    W=hidden_w_init,
                                    b=hidden_b_init,
                                    nonlinearity=hidden_nonlinearity,
                                    name="h%d" % (idx + 1))

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(l_hidden,
                                num_units=1,
                                W=output_w_init,
                                b=output_b_init,
                                nonlinearity=output_nonlinearity,
                                name="output")

        output_var = L.get_output(l_output, deterministic=True).flatten()

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LasagnePowered.__init__(self, [l_output])
Example #15
    def init_opt(self):
        observations_var = self.env.observation_space.new_tensor_variable(
            'observations', extra_dims=1)
        actions_var = self.env.action_space.new_tensor_variable('actions',
                                                                extra_dims=1)
        advantages_var = tensor_utils.new_tensor('advantage',
                                                 ndim=1,
                                                 dtype=theano.config.floatX)
        dist = self.policy.distribution
        dist_info_vars = self.policy.dist_info_sym(observations_var)
        old_dist_info_vars = self.backup_policy.dist_info_sym(observations_var)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

        pos_eps_dist_info_vars = self.pos_eps_policy.dist_info_sym(
            observations_var)
        neg_eps_dist_info_vars = self.neg_eps_policy.dist_info_sym(
            observations_var)
        mix_dist_info_vars = self.mix_policy.dist_info_sym(observations_var)

        surr = TT.sum(
            dist.log_likelihood_sym(actions_var, dist_info_vars) *
            advantages_var)
        surr_pos_eps = TT.sum(
            dist.log_likelihood_sym(actions_var, pos_eps_dist_info_vars) *
            advantages_var)
        surr_neg_eps = TT.sum(
            dist.log_likelihood_sym(actions_var, neg_eps_dist_info_vars) *
            advantages_var)
        surr_mix = TT.sum(
            dist.log_likelihood_sym(actions_var, mix_dist_info_vars) *
            advantages_var)
        surr_loglikelihood = TT.sum(
            dist.log_likelihood_sym(actions_var, mix_dist_info_vars))

        params = self.policy.get_params(trainable=True)
        mix_params = self.mix_policy.get_params(trainable=True)
        pos_eps_params = self.pos_eps_policy.get_params(trainable=True)
        neg_eps_params = self.neg_eps_policy.get_params(trainable=True)
        backup_params = self.backup_policy.get_params(trainable=True)

        grads = theano.grad(surr, params)
        grad_pos_eps = theano.grad(surr_pos_eps, pos_eps_params)
        grad_neg_eps = theano.grad(surr_neg_eps, neg_eps_params)
        grad_mix = theano.grad(surr_mix, mix_params)
        grad_mix_lh = theano.grad(surr_loglikelihood, mix_params)

        self.f_surr = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=surr)
        self.f_train = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grads)
        self.f_pos_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_pos_eps)
        self.f_neg_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_neg_eps)
        self.f_mix_grad = theano.function(
            inputs=[observations_var, actions_var, advantages_var],
            outputs=grad_mix)
        self.f_mix_lh = theano.function(inputs=[observations_var, actions_var],
                                        outputs=grad_mix_lh)
        # self.f_update = theano.function(
        #     inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4,
        #             eval_grad5, eval_grad6, eval_grad7],
        #     outputs=None,
        #     updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4,
        #                  eval_grad5, eval_grad6, eval_grad7], params,
        #                 learning_rate=self.learning_rate))
        self.f_kl = tensor_utils.compile_function(
            inputs=[observations_var],
            outputs=[mean_kl, max_kl],
        )
        return dict()
Example #16
    def __init__(
            self,
            input_shape,
            output_dim,
            mean_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_nonlinearity=None,
            normalize_inputs=True,
            normalize_outputs=True,
            name=None,
            batchsize=None,
            subsample_factor=1.,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
         network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
         mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only
         effective if adaptive_std is False. If adaptive_std is True, this
         parameter is ignored, and the weights for the std network are always
         learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the
         std network. Only used if `std_share_network` is False. It defaults to
         the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std
         network. Only used if `std_share_network` is False. It defaults to
         the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        self._batchsize = batchsize
        self._subsample_factor = subsample_factor

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self._optimizer = optimizer

        if mean_network is None:
            mean_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = MLP(
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        x_mean_var = theano.shared(
            np.zeros((1, ) + input_shape, dtype=theano.config.floatX),
            name="x_mean",
            broadcastable=(True, ) + (False, ) * len(input_shape))
        x_std_var = theano.shared(
            np.ones((1, ) + input_shape, dtype=theano.config.floatX),
            name="x_std",
            broadcastable=(True, ) + (False, ) * len(input_shape))
        y_mean_var = theano.shared(
            np.zeros((1, output_dim), dtype=theano.config.floatX),
            name="y_mean",
            broadcastable=(True, False))
        y_std_var = theano.shared(
            np.ones((1, output_dim), dtype=theano.config.floatX),
            name="y_std",
            broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(
            mean=normalized_means_var, log_std=normalized_log_stds_var)

        mean_kl = TT.mean(
            dist.kl_sym(
                dict(
                    mean=normalized_old_means_var,
                    log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        loss = -TT.mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
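The whitening above, in NumPy: the network sees normalized inputs and targets, and its outputs are mapped back through the shared statistics (mean via `y_std * m + y_mean`, log-std via an additive `log(y_std)`). A toy round trip with a zero network output:

import numpy as np

ys = np.random.randn(50, 2) * 0.1 - 1.0
y_mean = ys.mean(axis=0, keepdims=True)
y_std = ys.std(axis=0, keepdims=True)

normalized_means = np.zeros((50, 2))      # stand-in for the network's normalized output
normalized_log_stds = np.zeros((50, 2))

means = normalized_means * y_std + y_mean          # means_var
log_stds = normalized_log_stds + np.log(y_std)     # log_stds_var
print(np.allclose(means, y_mean), np.allclose(np.exp(log_stds), y_std))  # True True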
Example #17
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = tensor_utils.new_tensor(
            'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: tensor_utils.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        state_info_vars = {
            k: tensor_utils.new_tensor(
                k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -TT.sum(
                logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = -TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var
                      ] + state_info_vars_list
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(
            surr_obj, target=self.policy, inputs=input_list)

        f_kl = tensor_utils.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl, )
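In the recurrent branch, `valid_var` masks padded timesteps so that the surrogate and KL are averaged only over real data. The NumPy equivalent of `TT.sum(kl * valid_var) / TT.sum(valid_var)`:

import numpy as np

kl = np.array([[0.1, 0.2, 0.3],
               [0.4, 0.5, 0.6]])     # (paths, max_timesteps)
valid = np.array([[1.0, 1.0, 0.0],
                  [1.0, 1.0, 1.0]])  # 0 marks padding
mean_kl = (kl * valid).sum() / valid.sum()
print(mean_kl)  # 0.36 -- the padded 0.3 entry is ignored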
Example #18
    def __init__(self,
                 env_spec,
                 hidden_dim=32,
                 feature_network=None,
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_flat_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_flat_dim
        else:
            input_dim = obs_dim

        l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            l_feature = OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: TT.reshape(
                    flat_feature,
                    [input.shape[0], input.shape[1], feature_dim]),
                shape_op=lambda _, input_shape:
                (input_shape[0], input_shape[1], feature_dim))

        prob_network = GRUNetwork(input_shape=(feature_dim, ),
                                  input_layer=l_feature,
                                  output_dim=env_spec.action_space.n,
                                  hidden_dim=hidden_dim,
                                  hidden_nonlinearity=hidden_nonlinearity,
                                  output_nonlinearity=TT.nnet.softmax,
                                  name="prob_network")

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = TT.matrix("flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature, {feature_network.input_layer: flat_input_var})

        self.f_step_prob = tensor_utils.compile_function(
            [flat_input_var, prob_network.step_prev_hidden_layer.input_var],
            L.get_output([
                prob_network.step_output_layer, prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var}))

        self.input_dim = input_dim
        self.action_flat_dim = action_flat_dim
        self.hidden_dim = hidden_dim

        self.prev_action = None
        self.prev_hidden = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LasagnePowered.__init__(self, out_layers)
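At rollout time `f_step_prob` returns one row of softmax probabilities together with the next hidden state, and the caller samples a discrete action from that row. The sampling step on a stand-in row:

import numpy as np

probs = np.array([0.1, 0.6, 0.3])               # stand-in for f_step_prob's first output
action = np.random.choice(len(probs), p=probs)
one_hot = np.eye(len(probs))[action]            # flat representation of the Discrete action
print(action, one_hot)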
Example #19
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
         network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
         mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        x_mean_var = theano.shared(
            np.zeros((1, ) + input_shape),
            name="x_mean",
            broadcastable=(True, ) + (False, ) * len(input_shape))
        x_std_var = theano.shared(
            np.ones((1, ) + input_shape),
            name="x_std",
            broadcastable=(True, ) + (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob,
                                {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = tensor_utils.to_onehot_sym(
            TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = tensor_utils.compile_function([xs_var], predicted)
        self._f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Example #20
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 +
                                      4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = theano_tensor_utils.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = theano_tensor_utils.new_tensor(
            'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: theano_tensor_utils.new_tensor(
                k, ndim=2 + is_recurrent, dtype=theano.config.floatX)
            for k in self.policy.state_info_keys
        }
        state_info_vars_list = [
            state_info_vars[k] for k in self.policy.state_info_keys
        ]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = -TT.sum(logli * TT.exp(delta_v / param_eta - TT.max(
                delta_v / param_eta)) * valid_var) / TT.sum(valid_var)
        else:
            loss = -TT.mean(logli * TT.exp(delta_v / param_eta -
                                           TT.max(delta_v / param_eta)))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param))
             for param in reg_params]) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input_list = [
            rewards, obs_var, feat_diff, action_var
        ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]
        f_loss = theano_tensor_utils.compile_function(
            inputs=input_list,
            outputs=loss,
        )
        f_loss_grad = theano_tensor_utils.compile_function(
            inputs=input_list,
            outputs=loss_grad,
        )

        # Debug prints
        old_dist_info_vars = {
            k: theano_tensor_utils.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            mean_kl = TT.sum(
                dist.kl_sym(old_dist_info_vars, dist_info_vars) *
                valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = theano_tensor_utils.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list +
            old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.sum(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           ) * valid_var
                       ) / TT.sum(valid_var)
                   ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.mean(
                           TT.exp(
                            delta_v / param_eta - TT.max(delta_v / param_eta)
                           )
                       )
                   ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
                (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = theano_tensor_utils.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars
            + [param_eta, param_v],
            outputs=dual)
        f_dual_grad = theano_tensor_utils.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars
            + [param_eta, param_v],
            outputs=dual_grad)

        self.opt_info = dict(
            f_loss_grad=f_loss_grad,
            f_loss=f_loss,
            f_dual=f_dual,
            f_dual_grad=f_dual_grad,
            f_kl=f_kl)
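The dual above is what REPS' outer loop minimizes over eta (and `param_v`), e.g. with a box-constrained L-BFGS seeded from `self.param_eta` / `self.param_v`. A runnable NumPy version of the non-recurrent dual for fixed Bellman errors, using the same max-shift trick for numerical stability (L2 terms omitted):

import numpy as np

delta_v = np.array([1.0, 0.5, -0.2, 0.8])  # stand-in sample Bellman errors
epsilon = 0.1                              # KL bound

def dual(eta):
    z = delta_v / eta
    return eta * epsilon + eta * np.log(np.mean(np.exp(z - np.max(z)))) + eta * np.max(z)

etas = np.linspace(0.1, 20.0, 200)
print(etas[np.argmin([dual(e) for e in etas])])  # coarse 1-D minimization over eta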
Example #21
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            mean_network=None,
            std_network=None,
            dist_cls=DiagonalGaussian,
    ):
        """
        :param env_spec:
        :param hidden_sizes: sizes list for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: sizes list for the fully-connected layers
         for std
        :param min_std: lower bound on the std, enforced to avoid numerical
         issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :return:
        """
        assert isinstance(env_spec.action_space, Box)

        Serializable.quick_init(self, locals())

        obs_dim = env_spec.observation_space.flat_dim
        action_flat_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_flat_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_flat_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
            else:
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_flat_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_flat_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
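Given the `(mean, log_std)` pair returned by `_f_dist`, an action is a diagonal-Gaussian draw, with `min_std` keeping `exp(log_std)` bounded away from zero. The NumPy equivalent of the sampling step:

import numpy as np

mean = np.array([[0.0, 1.0]])                # stand-in outputs of _f_dist(obs)
log_std = np.array([[-8.0, -1.0]])
log_std = np.maximum(log_std, np.log(1e-6))  # the min_std clipping applied above
action = mean + np.exp(log_std) * np.random.randn(*mean.shape)
print(action.shape)  # (1, 2)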
Example #22
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, ),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_sizes: sizes list for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Box)

        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim +\
                env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_flat_dim = env_spec.action_space.flat_dim

        mean_network = GRUNetwork(
            input_shape=(obs_dim, ),
            output_dim=action_flat_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_flat_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_flat_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        self._f_step_mean_std = tensor_utils.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer, l_step_log_std,
                mean_network.step_hidden_layer
            ]))

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_flat_dim)

        self.reset()

        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])