Example #1
    def initialize(self, inputs, losses, constraints, target,
            givens=None, lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        # Phase 1: Compute gradient and save to GPU vector
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
        self._shared_grad = shared_grad

        # Phase 2: All-reduce gradient in-place in shared_grad, then reshape
        gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
        self._avg_factor_var = avg_factor_var  # (set later as 1 / n_gpu)

        # Phase 3: Apply combined gradient locally
        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        self._f_grad = ext.compile_function(
            inputs=inputs,
            outputs=loss,
            updates=[flat_update],
            givens=givens,
            log_name="gradient",
        )
        self._f_update = ext.compile_function(
            inputs=[],
            outputs=grad_norm,
            updates=updates,
            log_name="update",
        )
Example #2
    def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self._target = target

        updates = self._update_method(loss, target.get_params(trainable=True))
        updates = OrderedDict([(k, v.astype(k.dtype))
                               for k, v in updates.items()])

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                updates=updates,
            ))
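Since `lazydict` defers compilation until first access, only the functions a caller actually uses get compiled. A hedged usage sketch, assuming `opt` is an instance whose `update_opt` has already run and `input_arrays` matches the symbolic `inputs + extra_inputs`:

# Hypothetical sketch: each function is compiled once, on first lookup.
f_loss = opt._opt_fun["f_loss"]    # triggers compilation of f_loss
loss_before = f_loss(*input_arrays)
f_opt = opt._opt_fun["f_opt"]      # triggers compilation of f_opt
loss_after = f_opt(*input_arrays)  # runs one update step and returns the loss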
Example #3
    def initialize(self, env_spec, **kwargs):
        # Wait to do this until GPU is initialized.
        assert isinstance(env_spec.action_space, Discrete)
        s = retrieve_args(self)

        network = PgCnn(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=s.conv_filters,
            conv_filter_sizes=s.conv_filter_sizes,
            conv_strides=s.conv_strides,
            conv_pads=s.conv_pads,
            hidden_sizes=s.hidden_sizes,
            hidden_nonlinearity=s.hidden_nonlinearity,
            output_pi_nonlinearity=s.output_pi_nonlinearity,
            pixel_scale=s.pixel_scale,
            name="atari_cnn",
        )

        self._l_obs = network.input_layer
        input_var = network.input_layer.input_var

        prob, value = L.get_output(network.output_layers)

        self._f_prob = ext.compile_function([input_var], prob)
        self._f_value = ext.compile_function([input_var], value)
        self._f_prob_value = ext.compile_function([input_var], [prob, value])

        self._dist = Categorical(env_spec.action_space.n)
        self._network = network
        super().initialize(env_spec, network=network, **kwargs)
        self.param_short_names = \
            [shorten_param_name(p.name) for p in network.get_params(trainable=True)]
Example #4
    def init_opt(self):

        # First, create "target" policy and Q functions
        target_policy = pickle.loads(pickle.dumps(self.policy))
        target_qf = pickle.loads(pickle.dumps(self.qf))

        # The y values need to be computed first
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        yvar = TT.vector('ys')

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([TT.sum(TT.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = TT.mean(TT.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([TT.sum(TT.square(param))
                                        for param in self.policy.get_params(regularizable=True)])

        policy_qval = self.qf.get_qval_sym(obs,
                                           self.policy.get_action_sym(obs),
                                           deterministic=True)

        policy_surr = -TT.mean(policy_qval)

        policy_reg_surr = policy_surr + policy_weight_decay_term

        qf_updates = self.qf_update_method(qf_reg_loss,
                                           self.qf.get_params(trainable=True))
        policy_updates = self.policy_update_method(
            policy_reg_surr, self.policy.get_params(trainable=True))

        f_train_qf = ext.compile_function(inputs=[yvar, obs, action],
                                          outputs=[qf_loss, qval],
                                          updates=qf_updates)

        f_train_policy = ext.compile_function(inputs=[obs],
                                              outputs=policy_surr,
                                              updates=policy_updates)

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
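The `ys` fed into `f_train_qf` are computed outside the compiled graph, in the usual DDPG fashion, from the target networks stored in `opt_info`. A hedged sketch of that step; `next_observations`, `rewards`, `terminals`, and `self.discount` are assumptions about the surrounding replay-buffer loop, and the method names follow rllab's DDPG:

# Hypothetical sketch of computing the y targets used by f_train_qf.
target_policy = self.opt_info["target_policy"]
target_qf = self.opt_info["target_qf"]
next_actions, _ = target_policy.get_actions(next_observations)
next_qvals = target_qf.get_qval(next_observations, next_actions)
ys = rewards + (1.0 - terminals) * self.discount * next_qvals
qf_loss, qval = self.opt_info["f_train_qf"](ys, observations, actions)
policy_surr = self.opt_info["f_train_policy"](observations)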
Example #5
    def initialize(self, inputs, loss, target, priority_expr,
            givens=None, lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:
            gradients = scale_conv_gradients(params, gradients,
                scale=2 ** (-1 / 2))

        # Compute gradient and save to GPU vector.
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)
        self._shared_grad = shared_grad

        # All-reduce gradient in-place in shared_grad, then reshape
        gradients, avg_factor_var = avg_grads_from_flat(shared_grad, params)
        self._avg_factor_var = avg_factor_var

        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        self._f_gradient = ext.compile_function(
            inputs=inputs,
            outputs=[priority_expr, loss],
            updates=[flat_update],
            givens=givens,
            log_name="gradient",
        )

        self._f_update = ext.compile_function(
            inputs=[],
            updates=updates,
            log_name="update",
        )
Example #6
    def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self._target = target

        updates = self._update_method(loss, target.get_params(trainable=True))
        updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in list(updates.items())])

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                updates=updates,
            )
        )
Example #7
    def initialize(self,
                   inputs,
                   losses,
                   constraints,
                   target,
                   givens=None,
                   lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)

        # Prepare to load data onto GPU (and shuffle indexes there).
        load_updates, givens, opt_inputs = make_shared_inputs(
            inputs, self._shuffle)

        self._f_load = ext.compile_function(
            inputs=inputs,
            updates=load_updates,
            log_name="load",
        )
        self._f_opt = ext.compile_function(
            inputs=opt_inputs,
            outputs=[loss, grad_norm],
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )
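A plausible way to drive this load-once, iterate-on-GPU pattern is sketched below. It is an assumption that `make_shared_inputs` arranges for `opt_inputs` to be a minibatch selector (for example, a batch index); the exact call signature of `_f_opt` depends on that helper, which is not shown here.

# Hypothetical sketch: one host-to-device transfer, then many updates on shared data.
optimizer._f_load(*all_input_arrays)
for _ in range(n_epochs):
    for batch_idx in range(n_batches):
        loss, grad_norm = optimizer._f_opt(batch_idx)  # slices shared data via givens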
Example #8
    def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self._target = target

        def get_opt_output():
            flat_grad = flatten_tensor_variables(theano.grad(loss, target.get_params(trainable=True)))
            return [loss.astype('float64'), flat_grad.astype('float64')]

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = lazydict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(),
            )
        )
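Because `f_opt` returns the loss and the flattened gradient, both cast to float64, it plugs directly into SciPy's L-BFGS interface. A hedged sketch, assuming the rllab `Parameterized` accessors `get_param_values` / `set_param_values`:

import scipy.optimize

# Hypothetical sketch: feed f_opt's (loss, flat_grad) pair to L-BFGS.
def f_opt_wrapper(flat_params):
    target.set_param_values(flat_params, trainable=True)
    return opt._opt_fun["f_opt"](*input_arrays)  # returns (loss, flat_grad)

opt_params, final_loss, _ = scipy.optimize.fmin_l_bfgs_b(
    func=f_opt_wrapper,
    x0=target.get_param_values(trainable=True).astype('float64'),
    maxiter=20,
)
target.set_param_values(opt_params, trainable=True)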
Example #9
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self._target = target

        def get_opt_output():
            flat_grad = flatten_tensor_variables(
                theano.grad(loss, target.get_params(trainable=True)))
            return [loss.astype('float64'), flat_grad.astype('float64')]

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = lazydict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss),
            f_opt=lambda: compile_function(
                inputs=inputs + extra_inputs,
                outputs=get_opt_output(),
            ))
Example #10
    def initialize(self, env_spec, alternating_sampler=False):
        # Wait to do this until GPU is initialized.
        assert isinstance(env_spec.action_space, Discrete)
        s = retrieve_args(self)

        network = PgCnnRnn(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=s.conv_filters,
            conv_filter_sizes=s.conv_filter_sizes,
            conv_strides=s.conv_strides,
            conv_pads=s.conv_pads,
            conv_nonlinearity=s.conv_nonlinearity,
            hidden_sizes=s.hidden_sizes,
            hidden_nonlinearity=s.hidden_nonlinearity,
            output_pi_nonlinearity=s.output_pi_nonlinearity,
            pixel_scale=s.pixel_scale,
            name="atari_cnn_rnn",
        )

        self._l_obs = network.input_layer

        input_var = network.input_layer.input_var
        prev_hidden_vars = [lay.input_var for lay in network.prev_hidden_layers]
        prob, value = L.get_output(network.output_layers,
            step_or_train="step")
        hidden_vars = L.get_output(network.recurrent_layers,
            step_or_train="step")

        self._f_act = ext.compile_function(
            inputs=[input_var] + prev_hidden_vars,
            outputs=[prob, value] + hidden_vars,
        )
        self._f_prob_value = ext.compile_function(
            inputs=[input_var] + prev_hidden_vars,
            outputs=[prob, value],
        )
        self._f_prob = ext.compile_function(
            inputs=[input_var] + prev_hidden_vars,
            outputs=prob,
        )
        self._f_value = ext.compile_function(
            inputs=[input_var] + prev_hidden_vars,
            outputs=value,
        )
        self._f_hidden = ext.compile_function(
            inputs=[input_var] + prev_hidden_vars,
            outputs=hidden_vars,
        )

        self._dist = Categorical(env_spec.action_space.n)

        super().initialize(env_spec, network, alternating_sampler)
        self.param_short_names = \
            [shorten_param_name(p.name) for p in network.get_params(trainable=True)]
        self._network = network
        self._hprev_keys = ["hprev_{}".format(i)
            for i in range(len(network.recurrent_layers))]
        self.hid_init_params = self._network.hid_init_params
Example #11
    def update_opt(
        self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args, **kwargs
    ):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params)
        flat_grad = ext.flatten_tensor_variables(grads)

        constraint_grads = theano.grad(constraint_term, wrt=params)
        xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
        Hx_plain_splits = TT.grad(TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]), wrt=params)
        Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        if self._debug_nan:
            from theano.compile.nanguardmode import NanGuardMode

            mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        else:
            mode = None

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=loss, log_name="f_loss", mode=mode
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=flat_grad, log_name="f_grad", mode=mode
            ),
            f_Hx_plain=lambda: ext.compile_function(
                inputs=inputs + extra_inputs + xs, outputs=Hx_plain, log_name="f_Hx_plain", mode=mode
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=constraint_term, log_name="constraint", mode=mode
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs, outputs=[loss, constraint_term], log_name="f_loss_constraint", mode=mode
            ),
        )
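The point of compiling `f_Hx_plain` is to get Hessian-vector products without ever forming the Hessian, which is exactly what conjugate gradient needs to approximately solve Hx = g for the natural gradient step. A hedged sketch of the damped product CG would call; `unflatten_like`, `conjugate_gradient`, and `reg_coeff` are assumed helpers and settings, not code shown in this example:

# Hypothetical sketch: damped Hessian-vector product for conjugate gradient.
def Hx(flat_v):
    xs_arrays = tuple(unflatten_like(flat_v, params))  # split to per-param shapes (assumed helper)
    hv = opt._opt_fun["f_Hx_plain"](*(input_arrays + xs_arrays))
    return hv + reg_coeff * flat_v                     # Tikhonov damping for stability

step_dir = conjugate_gradient(Hx, flat_grad_value, cg_iters=10)  # assumed helper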
Example #12
    def initialize(self, inputs, losses, constraints, target, lr_mult=1):
        self._target = target
        loss = sum(losses)
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs='ignore')

        gradients, grad_norm = apply_grad_norm_clip(gradients, self._grad_norm_clip)

        # Phase 1: Compute gradient and save to GPU vector
        flat_grad, shared_grad, flat_update = flat_shared_grad(target, gradients)

        # Phase 2: apply gradient chunks to update central params; e.g. rmsprop
        lr = self._learning_rate * lr_mult
        if self.n_update_chunks > 1:
            updates_args = (shared_grad, lr, self.n_update_chunks, self._update_method_args)
            chunk_inputs, outputs_list, updates, idxs = \
                chunked_updates(self._update_method_name, updates_args)
            self._chunk_idxs = idxs
        else:
            whole_inputs, whole_outputs, whole_updates = \
                whole_update(self._update_method_name, shared_grad, lr, self._update_method_args)

        # Phase 3: copy new param values from shared_grad to params
        copy_updates = copy_params_from_flat(params, shared_grad)

        # Phase 1
        self._f_gradient = ext.compile_function(
            inputs=inputs,
            outputs=[loss, grad_norm],
            updates=[flat_update],
            log_name="gradient",
        )

        # Phase 2
        if self.n_update_chunks > 1:
            f_update_chunks = list()
            for i, (outputs, update) in enumerate(zip(outputs_list, updates)):
                f_update_chunks.append(ext.compile_function(
                    inputs=chunk_inputs,
                    outputs=outputs,
                    updates=[update],
                    log_name="update_chunk_{}".format(i))
                )
            self._f_update_chunks = f_update_chunks
        else:
            self._f_update = ext.compile_function(
                inputs=whole_inputs,
                outputs=whole_outputs,
                updates=whole_updates,
                log_name="update",
            )

        # Phase 3
        self._f_copy = ext.compile_function(
            inputs=[],
            updates=copy_updates,
            log_name="copy_params",
        )
Example #13
File: hf.py  Project: zhmz90/rllab
    def __init__(self, _p, inputs, s, costs, h=None, ha=None):
        '''Constructs and compiles the necessary Theano functions.

        p : list of Theano shared variables
            Parameters of the model to be optimized.
        inputs : list of Theano variables
            Symbolic variables that are inputs to your graph (they should also
            include your model 'output'). Your training examples must fit these.
        s : Theano variable
            Symbolic variable with respect to which the Hessian of the objective is
            positive-definite, implicitly defining the Gauss-Newton matrix. Typically,
            it is the activation of the output layer.
        costs : list of Theano variables
            Monitoring costs, the first of which will be the optimized objective.
        h: Theano variable or None
            Structural damping is applied to this variable (typically the hidden units
            of an RNN).
        ha: Theano variable or None
            Symbolic variable that implicitly defines the Gauss-Newton matrix for the
            structural damping term (typically the activation of the hidden layer). If
            None, it will be set to `h`.'''

        self.p = _p
        self.shapes = [i.get_value().shape for i in _p]
        self.sizes = list(map(numpy.prod, self.shapes))
        self.positions = numpy.cumsum([0] + self.sizes)[:-1]

        g = T.grad(costs[0], _p)
        g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
        self.f_gc = compile_function(inputs,
                                     g + costs)  # during gradient computation
        self.f_cost = compile_function(inputs,
                                       costs)  # for quick cost evaluation

        symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

        v = [symbolic_types[len(i)]() for i in self.shapes]
        Gv = gauss_newton_product(costs[0], _p, v, s)

        coefficient = T.scalar()  # this is lambda*mu
        if h is not None:  # structural damping with cross-entropy
            h_constant = symbolic_types[h.ndim]()  # T.Rop does not support `consider_constant` yet, so use `givens`
            structural_damping = coefficient * (
                -h_constant * T.log(h + 1e-10) -
                (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
            if ha is None: ha = h
            Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
            Gv = [a + b for a, b in zip(Gv, Gv_damping)]
            givens = {h_constant: h}
        else:
            givens = {}

        self.function_Gv = compile_function(inputs + v + [coefficient],
                                            Gv,
                                            givens=givens)
Example #14
    def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
        flat_grad = ext.flatten_tensor_variables(grads)

        self._hvp_approach.update_opt(f=constraint_term, target=target, inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name="f_loss",
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name="f_loss_constraint",
            ),
        )
Example #15
    def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
        flat_grad = ext.flatten_tensor_variables(grads)

        self._hvp_approach.update_opt(f=constraint_term, target=target, inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name="f_loss",
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name="f_loss_constraint",
            ),
        )
Example #16
    def __init__(
            self,
            env_spec,
            hidden_sizes=(),
            hidden_nonlinearity=NL.tanh,
            num_seq_inputs=1,
            neat_output_dim=20,
            neat_network=None,
            prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)
        # create random NEAT MLP
        if neat_network is None:
            neat_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,),
                output_dim=neat_output_dim,
                hidden_sizes=(12, 12),
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.identity,
            )

        if prob_network is None:
            prob_network = MLP(
                input_shape=(L.get_output_shape(neat_network.output_layer)[1],),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._phi = neat_network.output_layer
        self._obs = neat_network.input_layer
        self._neat_output = ext.compile_function([neat_network.input_layer.input_var], L.get_output(neat_network.output_layer))

        self.prob_network = prob_network
        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function([prob_network.input_layer.input_var], L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)

        super(PowerGradientPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #17
    def __init__(self, _p, inputs, s, costs, h=None, ha=None):
        '''Constructs and compiles the necessary Theano functions.

        p : list of Theano shared variables
            Parameters of the model to be optimized.
        inputs : list of Theano variables
            Symbolic variables that are inputs to your graph (they should also
            include your model 'output'). Your training examples must fit these.
        s : Theano variable
            Symbolic variable with respect to which the Hessian of the objective is
            positive-definite, implicitly defining the Gauss-Newton matrix. Typically,
            it is the activation of the output layer.
        costs : list of Theano variables
            Monitoring costs, the first of which will be the optimized objective.
        h: Theano variable or None
            Structural damping is applied to this variable (typically the hidden units
            of an RNN).
        ha: Theano variable or None
            Symbolic variable that implicitly defines the Gauss-Newton matrix for the
            structural damping term (typically the activation of the hidden layer). If
            None, it will be set to `h`.'''

        self.p = _p
        self.shapes = [i.get_value().shape for i in _p]
        self.sizes = list(map(numpy.prod, self.shapes))
        self.positions = numpy.cumsum([0] + self.sizes)[:-1]

        g = T.grad(costs[0], _p)
        g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
        self.f_gc = compile_function(inputs, g + costs)  # during gradient computation
        self.f_cost = compile_function(inputs, costs)  # for quick cost evaluation

        symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

        v = [symbolic_types[len(i)]() for i in self.shapes]
        Gv = gauss_newton_product(costs[0], _p, v, s)

        coefficient = T.scalar()  # this is lambda*mu
        if h is not None:  # structural damping with cross-entropy
            h_constant = symbolic_types[h.ndim]()  # T.Rop does not support `consider_constant` yet, so use `givens`
            structural_damping = coefficient * (
                -h_constant * T.log(h + 1e-10) - (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
            if ha is None: ha = h
            Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
            Gv = [a + b for a, b in zip(Gv, Gv_damping)]
            givens = {h_constant: h}
        else:
            givens = {}

        self.function_Gv = compile_function(inputs + v + [coefficient], Gv, givens=givens)
Example #18
    def update_opt(self,
                   loss,
                   target,
                   leq_constraint,
                   inputs,
                   constraint_name="constraint",
                   *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A list of constraints, each provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        constraint_terms = [c[0] for c in leq_constraint]
        constraint_values = [c[1] for c in leq_constraint]
        penalty_var = TT.scalar("penalty")
        penalty_loss = constraint_terms[0]
        for i in range(1, len(constraint_terms)):
            penalty_loss += constraint_terms[i]
        penalized_loss = loss + penalty_var * penalty_loss

        self._target = target
        self._max_constraint_vals = np.array(constraint_values)
        self._constraint_name = constraint_name

        def get_opt_output():
            flat_grad = flatten_tensor_variables(
                theano.grad(penalized_loss,
                            target.get_params(trainable=True),
                            disconnected_inputs='ignore'))
            return [
                penalized_loss.astype('float64'),
                flat_grad.astype('float64')
            ]

        self._opt_fun = lazydict(
            f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
            f_constraint=lambda: compile_function(
                inputs, penalty_loss, log_name="f_constraint"),
            f_penalized_loss=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=[penalized_loss, loss] + constraint_terms,
                log_name="f_penalized_loss",
            ),
            f_opt=lambda: compile_function(inputs=inputs + [penalty_var],
                                           outputs=get_opt_output(),
                                           log_name="f_opt"))
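The compiled `f_penalized_loss` / `f_opt` pair is meant to sit inside an outer search over the penalty coefficient: optimize at a fixed penalty, then raise the penalty if any constraint is violated and relax it otherwise. A rough sketch; the doubling/halving factors, the bounds, and `optimize_at_fixed_penalty` are assumptions modeled on rllab's penalty optimizers:

import numpy as np

# Hypothetical sketch of the outer penalty search loop.
penalty = 1.0
for _ in range(max_penalty_itr):
    optimize_at_fixed_penalty(penalty)     # inner loop, e.g. L-BFGS on f_opt
    outs = opt._opt_fun["f_penalized_loss"](*(input_arrays + [penalty]))
    constraint_vals = np.array(outs[2:])   # outputs after penalized_loss and loss
    if np.all(constraint_vals <= opt._max_constraint_vals):
        penalty /= 2.0                     # satisfied: try a weaker penalty
    else:
        penalty *= 2.0                     # violated: penalize harder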
Example #19
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   diagnostic_vars=None,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self.target = target

        if diagnostic_vars is None:
            diagnostic_vars = OrderedDict()

        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, params, disconnected_inputs='ignore')
        if self.gradient_clipping is not None:
            gradients = [
                TT.clip(g, -self.gradient_clipping, self.gradient_clipping)
                for g in gradients
            ]

        updates = self._update_method(gradients,
                                      target.get_params(trainable=True))
        updates = OrderedDict([(k, v.astype(k.dtype))
                               for k, v in updates.items()])

        if extra_inputs is None:
            extra_inputs = list()

        self.input_vars = inputs + extra_inputs
        self.f_train = ext.compile_function(
            inputs=self.input_vars,
            outputs=[loss] + list(diagnostic_vars.values()),
            updates=updates,
        )
        self.f_loss_diagnostics = ext.compile_function(
            inputs=self.input_vars,
            outputs=[loss] + list(diagnostic_vars.values()),
        )
        self.diagnostic_vars = diagnostic_vars
Example #20
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params(trainable=True)

        constraint_grads = theano.grad(
            f, wrt=params, disconnected_inputs='warn')
        xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

        def Hx_plain():
            Hx_plain_splits = TT.grad(
                TT.sum([TT.sum(g * x)
                        for g, x in zip(constraint_grads, xs)]),
                wrt=params,
                disconnected_inputs='warn'
            )
            return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

        self.opt_fun = ext.lazydict(
            f_Hx_plain=lambda: ext.compile_function(
                inputs=inputs + xs,
                outputs=Hx_plain(),
                log_name="f_Hx_plain",
            ),
        )
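In rllab this Hessian-vector helper is paired with a small closure that adds Tikhonov damping before handing the product to conjugate gradient. A hedged sketch of that pairing, modeled on `PerlmutterHvp.build_eval` (the input-slicing machinery is elided; plain application is shown instead):

# Hypothetical sketch mirroring PerlmutterHvp.build_eval in rllab.
def build_eval(hvp, input_arrays):
    def eval_hx(flat_x):
        xs = tuple(hvp.target.flat_to_params(flat_x, trainable=True))
        return hvp.opt_fun["f_Hx_plain"](*(input_arrays + xs)) + hvp.reg_coeff * flat_x
    return eval_hx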
Example #21
    def __init__(self, env_spec, hidden_sizes=(32, 32), hidden_nonlinearity=NL.tanh, prob_network=None):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var], L.get_output(prob_network.output_layer)
        )

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #22
    def initialize(self,
                   inputs,
                   loss,
                   target,
                   priority_expr,
                   givens=None,
                   lr_mult=1):
        self._target = target
        params = target.get_params(trainable=True)
        gradients = theano.grad(loss, wrt=params, disconnected_inputs="ignore")

        if self._scale_conv_grads:  # (for dueling network architecture)
            gradients = scale_conv_gradients(params,
                                             gradients,
                                             scale=2**(-1 / 2))

        gradients, grad_norm = apply_grad_norm_clip(gradients,
                                                    self._grad_norm_clip)
        lr = self._learning_rate * lr_mult  # (lr_mult can be shared variable)
        updates = self._update_method(gradients, params, learning_rate=lr)
        self._f_opt = ext.compile_function(
            inputs=inputs,
            outputs=[priority_expr, loss],
            updates=updates,
            givens=givens,
            log_name="grad_and_update",
        )
Example #23
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff
        params = target.get_params(trainable=True)

        constraint_grads = theano.grad(
            f, wrt=params, disconnected_inputs='warn')
        xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

        def Hx_plain():
            Hx_plain_splits = TT.grad(
                TT.sum([TT.sum(g * x)
                        for g, x in zip(constraint_grads, xs)]),
                wrt=params,
                disconnected_inputs='warn'
            )
            return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

        self.opt_fun = ext.lazydict(
            f_Hx_plain=lambda: ext.compile_function(
                inputs=inputs + xs,
                outputs=Hx_plain(),
                log_name="f_Hx_plain",
            ),
        )
Example #24
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor(
            'advantage',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX
        )
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = - TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(
            f_kl=f_kl,
        )
Example #25
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1 + is_recurrent,
                                       dtype=theano.config.floatX)
        dist = self.policy.distribution
        old_dist_info_vars = {
            k: ext.new_tensor('old_%s' % k,
                              ndim=2 + is_recurrent,
                              dtype=theano.config.floatX)
            for k in dist.dist_info_keys
        }
        old_dist_info_vars_list = [
            old_dist_info_vars[k] for k in dist.dist_info_keys
        ]

        if is_recurrent:
            valid_var = TT.matrix('valid')
        else:
            valid_var = None

        dist_info_vars = self.policy.dist_info_sym(obs_var, action_var)
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

        # formulate as a minimization problem
        # The gradient of the surrogate objective is the policy gradient
        if is_recurrent:
            surr_obj = -TT.sum(
                logli * advantage_var * valid_var) / TT.sum(valid_var)
            mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
            max_kl = TT.max(kl * valid_var)
        else:
            surr_obj = -TT.mean(logli * advantage_var)
            mean_kl = TT.mean(kl)
            max_kl = TT.max(kl)

        input_list = [obs_var, action_var, advantage_var]
        if is_recurrent:
            input_list.append(valid_var)

        self.optimizer.update_opt(surr_obj,
                                  target=self.policy,
                                  inputs=input_list)

        f_kl = ext.compile_function(
            inputs=input_list + old_dist_info_vars_list,
            outputs=[mean_kl, max_kl],
        )
        self.opt_info = dict(f_kl=f_kl)
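The typical consumer of this graph is an `optimize_policy` step that runs the first-order optimizer on the surrogate loss and then reports KL diagnostics through `f_kl`. A hedged sketch, with field names following rllab's VPG:

# Hypothetical sketch of the training step built on init_opt's outputs.
inputs = ext.extract(samples_data, "observations", "actions", "advantages")
if self.policy.recurrent:
    inputs += (samples_data["valids"],)
self.optimizer.optimize(inputs)

agent_infos = samples_data["agent_infos"]
dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
mean_kl, max_kl = self.opt_info["f_kl"](*(list(inputs) + dist_info_list))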
Example #26
    def initialize(self, env_spec, **kwargs):
        # Wait to do this until GPU is initialized
        assert isinstance(env_spec.action_space, Discrete)
        s = retrieve_args(self)

        network_args = dict(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=s.conv_filters,
            conv_filter_sizes=s.conv_filter_sizes,
            conv_strides=s.conv_strides,
            conv_pads=s.conv_pads,
            hidden_sizes=s.hidden_sizes,
            hidden_nonlinearity=s.hidden_nonlinearity,
            pixel_scale=s.pixel_scale,
            dueling=s.dueling,
            shared_last_bias=s.shared_last_bias,
        )
        policy_network = DqnCnn(name="policy", **network_args)
        target_network = DqnCnn(name="target", **network_args)

        self._l_obs = policy_network.input_layer
        self._l_target_obs = target_network.input_layer
        self._policy_network = policy_network
        self._target_network = target_network

        input_var = policy_network.input_layer.input_var
        q = L.get_output(policy_network.output_layer)
        self._f_q = ext.compile_function([input_var], q)
        greedy_action = T.argmax(q, axis=1)
        self._f_a = ext.compile_function([input_var], greedy_action)

        target_input_var = target_network.input_layer.input_var
        target_q = L.get_output(target_network.output_layer)
        self._f_target_q = ext.compile_function([target_input_var], target_q)

        policy_params = policy_network.get_params(trainable=True)
        target_params = target_network.get_params(trainable=True)
        updates = [(t, p) for p, t in zip(policy_params, target_params)]
        self._f_update_target = ext.compile_function(inputs=[], updates=updates)
        self._f_update_target()

        super().initialize(env_spec, network=policy_network, **kwargs)
        self._epsilon = s.epsilon
        self.param_short_names = \
            [shorten_param_name(p.name) for p in policy_params]
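With these compiled functions, action selection and the periodic hard sync of the target network reduce to a few lines. A hedged sketch; the epsilon schedule, `n_actions`, and `target_update_interval` are assumptions about the surrounding DQN loop:

import numpy as np

# Hypothetical sketch: epsilon-greedy acting plus periodic hard target sync.
def choose_action(agent, obs, n_actions):
    if np.random.rand() < agent._epsilon:
        return np.random.randint(n_actions)       # explore uniformly
    return int(agent._f_a(obs[np.newaxis])[0])    # greedy w.r.t. the policy Q network

# In the training loop: copy policy params into the target network periodically.
# if itr % target_update_interval == 0:
#     agent._f_update_target()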
Example #27
    def __init__(self, wrapped_constraint, 
                       env_spec, 
                       yield_zeros_until=1,
                       optimizer=None, 
                       hidden_sizes=(32,), 
                       hidden_nonlinearity=NL.sigmoid, 
                       lag_time=10, 
                       coeff=1.,
                       filter_bonuses=False,
                       max_epochs=25,
                       *args, **kwargs):

        Serializable.quick_init(self, locals())

        self._wrapped_constraint = wrapped_constraint
        self._env_spec = env_spec
        self._filter_bonuses = filter_bonuses
        self._yield_zeros_until = yield_zeros_until
        self._hidden_sizes = hidden_sizes
        self._lag_time = lag_time
        self._coeff = coeff
        self._max_epochs = max_epochs
        self.use_bonus = True

        if optimizer is None:
            #optimizer = LbfgsOptimizer()
            optimizer = FirstOrderOptimizer(max_epochs=max_epochs, batch_size=None)

        self._optimizer = optimizer

        obs_dim = env_spec.observation_space.flat_dim

        predictor_network = MLP(1, hidden_sizes, hidden_nonlinearity, NL.sigmoid,
                                input_shape=(obs_dim,))

        LasagnePowered.__init__(self, [predictor_network.output_layer])

        x_var = predictor_network.input_layer.input_var
        y_var = TT.matrix("ys")
        out_var = L.get_output(predictor_network.output_layer, 
                               {predictor_network.input_layer: x_var})

        regression_loss = TT.mean(TT.square(y_var - out_var))

        optimizer_args = dict(
            loss=regression_loss,
            target=self,
            inputs=[x_var, y_var],
        )

        self._optimizer.update_opt(**optimizer_args)
        self._f_predict = compile_function([x_var], out_var)

        self._fit_steps = 0

        self.has_baseline = self._wrapped_constraint.has_baseline
        if self.has_baseline:
            self.baseline = self._wrapped_constraint.baseline
Example #28
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, ),
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh,
                 output_b_init=None,
                 weight_signal=1.0,
                 weight_nonsignal=1.0,
                 weight_smc=1.0):
        """
        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(InitCategoricalGRUPolicy, self).__init__(env_spec)

        assert len(hidden_sizes) == 1

        output_b_init = compute_output_b_init(env_spec.action_space.names,
                                              output_b_init, weight_signal,
                                              weight_nonsignal, weight_smc)

        if state_include_action:
            input_shape = (env_spec.observation_space.flat_dim +
                           env_spec.action_space.flat_dim, )
        else:
            input_shape = (env_spec.observation_space.flat_dim, )

        prob_network = InitGRUNetwork(input_shape=input_shape,
                                      output_dim=env_spec.action_space.n,
                                      hidden_dim=hidden_sizes[0],
                                      hidden_nonlinearity=hidden_nonlinearity,
                                      output_nonlinearity=NL.softmax,
                                      output_b_init=output_b_init)

        self._prob_network = prob_network
        self._state_include_action = state_include_action

        self._f_step_prob = ext.compile_function(
            [
                prob_network.step_input_layer.input_var,
                prob_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                prob_network.step_output_layer, prob_network.step_hidden_layer
            ]))

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentCategorical(env_spec.action_space.n)

        self.reset()

        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #29
    def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args,
                   **kwargs):

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
        flat_grad = ext.flatten_tensor_variables(grads)

        self._hvp_approach.update_opt(f=constraint_term, target=target, inputs=inputs + extra_inputs,
                                      reg_coeff=self._reg_coeff)

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name="f_loss",
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name="f_loss_constraint",
            ),
        )
Example #30
    def __init__(self, env_spec, tau=0.9):
        self._baseline = theano.shared(0.0, name="baseline")

        return_mean = T.scalar("empirical_return_mean")
        updated_baseline = \
                tau * self._baseline \
                  + (1 - tau) * return_mean

        self._update_baseline = compile_function(
            inputs=[return_mean], updates={self._baseline: updated_baseline})
Example #31
    def incorporate_z(self, z):
        """ Called from the algo while initializing """
        self.z = z  # (a Theano shared variable, 1-D tensor)
        assert len(z.get_value()) == self.n_atoms
        actions = self.actions_sym()
        obs_var = self._l_obs.input_var
        self._f_a = ext.compile_function(
            inputs=[obs_var],
            outputs=actions,
        )
Example #32
    def __init__(self, env_spec, tau=0.9):
        self._baseline = theano.shared(0.0, name="baseline")

        return_mean = T.scalar("empirical_return_mean")
        updated_baseline = \
                tau * self._baseline \
                  + (1 - tau) * return_mean

        self._update_baseline = compile_function(
            inputs=[return_mean],
            updates={self._baseline: updated_baseline})
Example #33
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32,),
            state_include_action=True,
            hidden_nonlinearity=NL.tanh):
        """
        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        assert len(hidden_sizes) == 1

        if state_include_action:
            input_shape = (env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim,)
        else:
            input_shape = (env_spec.observation_space.flat_dim,)

        prob_network = GRUNetwork(
            input_shape=input_shape,
            output_dim=env_spec.action_space.n,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

        self._prob_network = prob_network
        self._state_include_action = state_include_action

        self._f_step_prob = ext.compile_function(
            [
                prob_network.step_input_layer.input_var,
                prob_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                prob_network.step_output_layer,
                prob_network.step_hidden_layer
            ])
        )

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentCategorical(env_spec.action_space.n)

        self.reset()

        LasagnePowered.__init__(self, [prob_network.output_layer])
Example #34
def test_gru_network():
    from rllab.core.network import GRUNetwork
    import lasagne.layers as L
    from rllab.misc import ext
    import numpy as np
    network = GRUNetwork(
        input_shape=(2, 3),
        output_dim=5,
        hidden_dim=4,
    )
    f_output = ext.compile_function(inputs=[network.input_layer.input_var],
                                    outputs=L.get_output(network.output_layer))
    assert f_output(np.zeros((6, 8, 2, 3))).shape == (6, 8, 5)
Example #35
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=LI.HeUniform(),
            hidden_b_init=LI.Constant(0.),
            output_nonlinearity=NL.tanh,
            output_W_init=LI.Uniform(-3e-3, 3e-3),
            output_b_init=LI.Uniform(-3e-3, 3e-3),
            bn=False):
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim))

        l_hidden = l_obs
        if bn:
            l_hidden = batch_norm(l_hidden)

        for idx, size in enumerate(hidden_sizes):
            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % idx
            )
            if bn:
                l_hidden = batch_norm(l_hidden)

        l_output = L.DenseLayer(
            l_hidden,
            num_units=env_spec.action_space.flat_dim,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        # Note the deterministic=True argument. It makes sure that when getting
        # actions from single observations, we do not update params in the
        # batch normalization layers

        action_var = L.get_output(l_output, deterministic=True)
        self._output_layer = l_output

        self._f_actions = ext.compile_function([l_obs.input_var], action_var)

        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [l_output])
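A hedged sketch of the usual get_action wrapper around _f_actions; the observation is assumed already flattened, and the policy instance is hypothetical here.

import numpy as np

flat_obs = np.zeros(10, dtype=np.float32)    # assumed flat_dim of 10
batch = flat_obs[None, :]                    # _f_actions expects (N, obs_dim)
# action = policy._f_actions(batch)[0]       # deterministic, in [-1, 1]
#                                            # because of the tanh output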
Example No. 37
    def __init__(
            self,
            env_spec,
            conv_filters, conv_filter_sizes, conv_strides, conv_pads,
            hidden_sizes=[],
            hidden_nonlinearity=NL.rectify,
            output_nonlinearity=NL.softmax,
            prob_network=None,
            name=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        self._env_spec = env_spec

        if prob_network is None:
            if not name:
                name = "categorical_conv_prob_network"
            prob_network = ConvNetwork(
                input_shape=env_spec.observation_space.shape,
                output_dim=env_spec.action_space.n,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name=name,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer)
        )

        self._dist = Categorical(env_spec.action_space.n)

        super(CategoricalConvPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example No. 38
    def __init__(
        self,
        env_spec,
        latent_dim=0,  # latent-related args are placeholders here
        latent_name='categorical',
        bilinear_integration=False,
        resample=False,  # (none of them affect this policy)
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        # latent bookkeeping (kept only so downstream code can read it)
        self.latent_dim = latent_dim  # kept on self for get_action's use
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self._set_std_to_0 = False
        # self._set_std_to_0 = True

        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Discrete)

        if prob_network is None:
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical(env_spec.action_space.n)
        self._layers = prob_network.layers  # expose layers for get_params()

        super(CategoricalMLPPolicy, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example No. 39
    def update_opt(self, loss, target, leq_constraint, inputs, constraint_name="constraint", *args, **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """
        constraint_term, constraint_value = leq_constraint
        penalty_var = TT.scalar("penalty")
        penalized_loss = loss + penalty_var * constraint_term

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name

        def get_opt_output():
            flat_grad = flatten_tensor_variables(theano.grad(
                penalized_loss, target.get_params(trainable=True), disconnected_inputs='ignore'
            ))
            return [penalized_loss.astype('float64'), flat_grad.astype('float64')]

        self._opt_fun = lazydict(
            f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
            f_constraint=lambda: compile_function(inputs, constraint_term, log_name="f_constraint"),
            f_penalized_loss=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=[penalized_loss, loss, constraint_term],
                log_name="f_penalized_loss",
            ),
            f_opt=lambda: compile_function(
                inputs=inputs + [penalty_var],
                outputs=get_opt_output(),
                log_name="f_opt"
            )
        )
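The compiled functions above usually feed a penalty search: increase the penalty until the constraint is satisfied. A toy, self-contained sketch of that loop (the doubling schedule and the stand-in function are assumptions, not taken from this snippet):

def f_penalized_loss(penalty):               # stand-in for the compiled fn
    constraint = 0.05 / (1.0 + penalty)      # toy constraint that shrinks
    return 1.0 + penalty * constraint, 1.0, constraint

penalty, max_constraint_val = 1.0, 0.01
for _ in range(8):
    _, loss_val, constraint_val = f_penalized_loss(penalty)
    if constraint_val <= max_constraint_val:
        break                                # feasible: stop escalating
    penalty *= 2.0                           # infeasible: penalize harder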
Example No. 40
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other network params
        are ignored
        :return:
        """
        Serializable.quick_init(self, locals())

        assert isinstance(env_spec.action_space, Product)
        self._Da = env_spec.action_space.comp_dim
        self._actnum = len(self._Da)
        self._slice = [0] + np.cumsum(self._Da).tolist()

        if prob_network is None:
            prob_network = MLP2(
                input_shape=(env_spec.observation_space.flat_dim *
                             num_seq_inputs, ),
                output_sizes=self._Da,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        self._l_prob = prob_network.output_layer
        # (an alternative kept out: slice the logits per action component,
        # softmax each slice, and compile a per-component f_prob)
        self._l_obs = prob_network.input_layer
        self._f_prob = ext.compile_function(
            [prob_network.input_layer.input_var],
            L.get_output(prob_network.output_layer))

        self._dist = Categorical2(self._Da)

        super(CategoricalMLPPolicy2, self).__init__(env_spec)
        LasagnePowered.__init__(self, [prob_network.output_layer])
Example No. 41
    def update_opt(self,
                   loss,
                   target,
                   inputs,
                   extra_inputs=None,
                   gradients=None,
                   **kwargs):
        self._target = target

        if gradients is None:
            gradients = theano.grad(loss,
                                    target.get_params(trainable=True),
                                    disconnected_inputs='ignore')
        flat_grad = ext.flatten_tensor_variables(gradients)

        if extra_inputs is None:
            extra_inputs = list()

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs + extra_inputs, loss,
                log_name=self._name + "_f_loss"),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name=self._name + "_f_grad"),
        )
Example No. 43
    def init_opt(self):
        """
        Initialize the optimization procedure. If using theano, this may
        include declaring all the variables and compiling functions.
        """
        # specify target policy
        target_policy = pickle.loads(pickle.dumps(self.policy))
        target_var = TT.vector('target', dtype=theano.config.floatX)

        # building network
        obs_var = self.env.observation_space.new_tensor_variable('obs', extra_dims=1)
        action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)
        _, qval_var_all_dict = self.policy.get_action_sym(obs_var)
        qval_var_all = qval_var_all_dict["action_values"]
        qval_var = qval_var_all[TT.arange(qval_var_all.shape[0]), action_var]
        # Huber-style error clipping: quadratic up to the clip, linear beyond
        diff = target_var - qval_var
        if self.clip_gradient > 0:
            quadratic_part = TT.minimum(abs(diff), self.clip_gradient)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * TT.square(quadratic_part) + self.clip_gradient * linear_part
        else:
            loss = 0.5 * TT.square(diff)
        loss_var = TT.mean(loss)
        params = self.policy.get_params(trainable=True)
        updates = self.update_method(loss_var, params)

        # debugging functions
        # also uncomment mode=theano.compile.MonitorMode(pre_func=inspect_inputs, post_func=inspect_outputs)
        # def inspect_inputs(i, node, fn):
        #     print(i, node, "input(s) shape(s):", [input[0].shape for input in fn.inputs],end='')
        # def inspect_outputs(i, node, fn):
        #     print(" output(s) shape(s):", [output[0].shape for output in fn.outputs])

        f_train_policy = ext.compile_function(
            inputs=[obs_var, action_var, target_var],
            outputs=[loss_var, qval_var],
            updates=updates,
            name='f_train_policy',
            # mode=theano.compile.MonitorMode(pre_func=inspect_inputs, post_func=inspect_outputs)
        )

        self.opt_info = dict(
            f_train_policy=f_train_policy,
            target_policy=target_policy
        )
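The clipped branch above is a Huber-style loss: quadratic inside the clip, linear outside, with matching slope at the boundary. A quick NumPy check, independent of the snippet:

import numpy as np

def clipped_loss(diff, c):
    quad = np.minimum(np.abs(diff), c)
    lin = np.abs(diff) - quad
    return 0.5 * quad ** 2 + c * lin

assert clipped_loss(0.5, 1.0) == 0.125       # quadratic region: 0.5 * 0.25
assert clipped_loss(3.0, 1.0) == 2.5         # linear region: 0.5 + 1.0 * 2.0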
Example No. 44
    def update_opt(self, f, target, inputs, reg_coeff):
        self.target = target
        self.reg_coeff = reg_coeff

        params = target.get_params(trainable=True)

        constraint_grads = theano.grad(
            f, wrt=params, disconnected_inputs='warn')
        flat_grad = ext.flatten_tensor_variables(constraint_grads)

        def f_Hx_plain(*args):
            inputs_ = args[:len(inputs)]
            xs = args[len(inputs):]
            flat_xs = np.concatenate([np.reshape(x, (-1,)) for x in xs])
            param_val = self.target.get_param_values(trainable=True)
            eps = np.cast['float32'](
                self.base_eps / (np.linalg.norm(param_val) + 1e-8))
            self.target.set_param_values(
                param_val + eps * flat_xs, trainable=True)
            flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
            if self.symmetric:
                self.target.set_param_values(
                    param_val - eps * flat_xs, trainable=True)
                flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
                hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
                self.target.set_param_values(param_val, trainable=True)
            else:
                # one-sided difference around the original parameters
                self.target.set_param_values(param_val, trainable=True)
                flat_grad_dv = self.opt_fun["f_grad"](*inputs_)
                hx = (flat_grad_dvplus - flat_grad_dv) / eps
            return hx

        self.opt_fun = ext.lazydict(
            f_grad=lambda: ext.compile_function(
                inputs=inputs,
                outputs=flat_grad,
                log_name="f_grad",
            ),
            f_Hx_plain=lambda: f_Hx_plain,
        )
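f_Hx_plain above approximates the Hessian-vector product with finite differences of the gradient. On a quadratic the symmetric rule is exact up to float error, which a standalone NumPy check makes concrete:

import numpy as np

A = np.array([[2.0, 0.3], [0.3, 1.0]])       # Hessian of f(w) = 0.5 w'Aw
grad = lambda w: A.dot(w)
w0, x, eps = np.ones(2), np.array([1.0, -1.0]), 1e-5

hx = (grad(w0 + eps * x) - grad(w0 - eps * x)) / (2 * eps)  # symmetric case
assert np.allclose(hx, A.dot(x))             # matches the exact product Hx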
Example No. 45
    def update_opt(self, loss, target, inputs, network_outputs, extra_inputs=None):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.parameterized.Parameterized` class.
        :param inputs: A list of symbolic variables as inputs
        :return: No return value.
        """

        self._target = target

        if extra_inputs is None:
            extra_inputs = list()

        self._hf_optimizer = hf_optimizer(
            _p=target.get_params(trainable=True),
            inputs=(inputs + extra_inputs),
            s=network_outputs,
            costs=[loss],
        )

        self._opt_fun = lazydict(
            f_loss=lambda: compile_function(inputs + extra_inputs, loss),
        )
Example No. 46
    def __init__(
            self,
            input_shape,
            output_dim,
            prob_network=None,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            normalize_inputs=True,
            name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        x_mean_var = theano.shared(
            np.zeros((1,) + input_shape),
            name="x_mean",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )
        x_std_var = theano.shared(
            np.ones((1,) + input_shape),
            name="x_std",
            broadcastable=(True,) + (False, ) * len(input_shape)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = - TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Example No. 47
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.rectify,
            hidden_W_init=lasagne.init.HeUniform(),
            hidden_b_init=lasagne.init.Constant(0.),
            action_merge_layer=-2,
            output_nonlinearity=None,
            output_W_init=lasagne.init.Uniform(-3e-3, 3e-3),
            output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
            bn=False):
        Serializable.quick_init(self, locals())

        l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
        l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")

        n_layers = len(hidden_sizes) + 1

        if n_layers > 1:
            action_merge_layer = \
                (action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(hidden_sizes):
            if bn:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                W=hidden_W_init,
                b=hidden_b_init,
                nonlinearity=hidden_nonlinearity,
                name="h%d" % (idx + 1)
            )

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(
            l_hidden,
            num_units=1,
            W=output_W_init,
            b=output_b_init,
            nonlinearity=output_nonlinearity,
            name="output"
        )

        output_var = L.get_output(l_output, deterministic=True).flatten()

        self._f_qval = ext.compile_function([l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action
        self._output_nonlinearity = output_nonlinearity

        LasagnePowered.__init__(self, [l_output])
Example No. 48
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32,),
            state_include_action=True,
            hidden_nonlinearity=NL.tanh,
            learn_std=True,
            init_std=1.0,
            output_nonlinearity=None,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_sizes: list of sizes for the fully connected hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        Serializable.quick_init(self, locals())
        super(GaussianGRUPolicy, self).__init__(env_spec)

        assert len(hidden_sizes) == 1

        if state_include_action:
            obs_dim = env_spec.observation_space.flat_dim + env_spec.action_space.flat_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        mean_network = GRUNetwork(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_dim=hidden_sizes[0],
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        l_step_log_std = ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._l_log_std = l_log_std
        self._state_include_action = state_include_action

        self._f_step_mean_std = ext.compile_function(
            [
                mean_network.step_input_layer.input_var,
                mean_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer,
                l_step_log_std,
                mean_network.step_hidden_layer
            ])
        )

        self._prev_action = None
        self._prev_hidden = None
        self._hidden_sizes = hidden_sizes
        self._dist = RecurrentDiagonalGaussian(action_dim)

        self.reset()

        LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
Example No. 49
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            hidden_sizes,
            conv_filters, conv_filter_sizes, conv_strides, conv_pads,
            hidden_nonlinearity=NL.rectify,
            mean_network=None,

            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            subsample_factor=1.0,
            batchsize=None,

            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_conv_filters=[], std_conv_filter_sizes=[],
            std_conv_strides=[], std_conv_pads=[],
            std_hidden_sizes=(32, 32),
            std_nonlinearity=None,
            normalize_inputs=True,
            normalize_outputs=True,
    ):
        """
        :param input_shape: usually for images of the form (width,height,channel)
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer

        self.input_shape = input_shape
        if mean_network is None:
            mean_network = ConvNetwork(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = ConvNetwork(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                conv_filters=std_conv_filters,
                conv_filter_sizes=std_conv_filter_sizes,
                conv_strides=std_conv_strides,
                conv_pads=std_conv_pads,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        x_mean_var = theano.shared(
            np.zeros((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_mean",
            broadcastable=(True,False),
        )
        x_std_var = theano.shared(
            np.ones((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_std",
            broadcastable=(True,False),
        )
        y_mean_var = theano.shared(
            np.zeros((1, output_dim), dtype=theano.config.floatX),
            name="y_mean",
            broadcastable=(True, False)
        )
        y_std_var = theano.shared(
            np.ones((1, output_dim), dtype=theano.config.floatX),
            name="y_std",
            broadcastable=(True, False)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(
            mean=normalized_means_var, log_std=normalized_log_stds_var)

        mean_kl = TT.mean(dist.kl_sym(
            dict(mean=normalized_old_means_var,
                 log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

        loss = - \
            TT.mean(dist.log_likelihood_sym(
                normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
        self._subsample_factor = subsample_factor
        self._batchsize = batchsize
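The normalization algebra above relies on two identities: means denormalize as normalized * y_std + y_mean, and log-stds shift additively by log(y_std). A tiny NumPy check:

import numpy as np

y_mean, y_std = np.array([2.0]), np.array([3.0])
y = np.array([5.0])
assert np.allclose((y - y_mean) / y_std * y_std + y_mean, y)
# log-std shifts additively: log(sigma * y_std) == log(sigma) + log(y_std)
assert np.allclose(np.log(0.5 * y_std), np.log(0.5) + np.log(y_std))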
Example No. 50
File: bnn.py Project: jpdoyle/vime
    def build_model(self):

        # Prepare Theano variables for inputs and targets
        # Same input for classification as regression.
        input_var = T.matrix('inputs',
                             dtype=theano.config.floatX)  # @UndefinedVariable
        target_var = T.matrix('targets',
                              dtype=theano.config.floatX)  # @UndefinedVariable

        # Loss function.
        loss = self.loss(input_var, target_var)
        loss_only_last_sample = self.loss_last_sample(input_var, target_var)

        # Create update methods.
        params = lasagne.layers.get_all_params(self.network, trainable=True)
        updates = lasagne.updates.adam(
            loss, params, learning_rate=self.learning_rate)

        # Train/val fn.
        self.pred_fn = ext.compile_function(
            [input_var], self.pred_sym(input_var), log_name='pred_fn')
        self.train_fn = ext.compile_function(
            [input_var, target_var], loss, updates=updates, log_name='train_fn')

        if self.second_order_update:

            oldparams = lasagne.layers.get_all_params(
                self.network, oldparam=True)
            step_size = T.scalar('step_size',
                                 dtype=theano.config.floatX)  # @UndefinedVariable

            def second_order_update(loss_or_grads, params, oldparams, step_size):
                """Second-order update method for optimizing loss_last_sample, so basically,
                KL term (new params || old params) + NLL of latest sample. The Hessian is
                evaluated at the origin and provides curvature information to make a more
                informed step in the correct descent direction."""
                grads = T.grad(loss_or_grads, params)
                updates = OrderedDict()
                for i in xrange(len(params)):
                    param = params[i]
                    grad = grads[i]
                    if param.name == 'mu' or param.name == 'b_mu':
                        oldparam_rho = oldparams[i + 1]
                        invH = T.square(T.log(1 + T.exp(oldparam_rho)))
                    else:
                        oldparam_rho = oldparams[i]
                        p = param

                        H = 2. * (T.exp(2 * p)) / \
                            (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2)
                        invH = 1. / H
                    updates[param] = param - step_size * invH * grad

                return updates

            def fast_kl_div(loss, params, oldparams, step_size):

                grads = T.grad(loss, params)

                kl_component = []
                for i in xrange(len(params)):
                    param = params[i]
                    grad = grads[i]

                    if param.name == 'mu' or param.name == 'b_mu':
                        oldparam_rho = oldparams[i + 1]
                        invH = T.square(T.log(1 + T.exp(oldparam_rho)))
                    else:
                        oldparam_rho = oldparams[i]
                        p = param

                        H = 2. * (T.exp(2 * p)) / \
                            (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2)
                        invH = 1. / H

                    kl_component.append(
                        T.sum(T.square(step_size) * T.square(grad) * invH))

                return sum(kl_component)

            compute_fast_kl_div = fast_kl_div(
                loss_only_last_sample, params, oldparams, step_size)

            self.train_update_fn = ext.compile_function(
                [input_var, target_var, step_size], compute_fast_kl_div, log_name='f_compute_fast_kl_div')

            # Alternative: compile the second-order update directly.
            # updates_kl = second_order_update(
            #     loss_only_last_sample, params, oldparams, step_size)
            # self.train_update_fn = ext.compile_function(
            #     [input_var, target_var, step_size], loss_only_last_sample,
            #     updates=updates_kl, log_name='train_update_fn')
        else:
            self.train_update_fn = ext.compile_function(
                [input_var, target_var], loss_only_last_sample, updates=updates, log_name='train_update_fn')

        # Compiled as the closed-form KL divergence; conceptually it is the surprise.
        self.f_kl_div_closed_form = ext.compile_function(
            [], self.surprise(), log_name='kl_div_fn')
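The invH expressions above come from the KL curvature of a diagonal Gaussian under the softplus parameterization sigma = log(1 + exp(rho)): for a mean parameter, the Hessian of the KL with respect to the mean is 1 / sigma^2, so its inverse is sigma^2. A numeric sketch:

import numpy as np

rho = 0.3
sigma = np.log1p(np.exp(rho))        # softplus, as in the code above
invH_mu = sigma ** 2                 # matches T.square(T.log(1 + T.exp(rho)))
d = 1e-3                             # small perturbation of the mean
kl = d ** 2 / (2 * sigma ** 2)       # KL(N(mu+d, s^2) || N(mu, s^2))
assert np.isclose(kl, 0.5 * d ** 2 / invH_mu)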
Example No. 51
    def init_opt(self):
        is_recurrent = int(self.policy.recurrent)

        # Init dual param values
        self.param_eta = 15.
        # Adjust for linear feature vector.
        self.param_v = np.random.rand(self.env.observation_space.flat_dim * 2 + 4)

        # Theano vars
        obs_var = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1 + is_recurrent,
        )
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1 + is_recurrent,
        )
        rewards = ext.new_tensor(
            'rewards',
            ndim=1 + is_recurrent,
            dtype=theano.config.floatX,
        )
        # Feature difference variable representing the difference in feature
        # value of the next observation and the current observation \phi(s') -
        # \phi(s).
        feat_diff = ext.new_tensor(
            'feat_diff',
            ndim=2 + is_recurrent,
            dtype=theano.config.floatX
        )
        param_v = TT.vector('param_v')
        param_eta = TT.scalar('eta')

        valid_var = TT.matrix('valid')

        state_info_vars = {
            k: ext.new_tensor(
                k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in self.policy.state_info_keys
        }
        state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

        # Policy-related symbolics
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        dist = self.policy.distribution
        # log of the policy dist
        logli = dist.log_likelihood_sym(action_var, dist_info_vars)

        # Symbolic sample Bellman error
        delta_v = rewards + TT.dot(feat_diff, param_v)

        # Policy loss (negative because we minimize)
        if is_recurrent:
            loss = - TT.sum(logli * TT.exp(
                delta_v / param_eta - TT.max(delta_v / param_eta)
            ) * valid_var) / TT.sum(valid_var)
        else:
            loss = - TT.mean(logli * TT.exp(
                delta_v / param_eta - TT.max(delta_v / param_eta)
            ))

        # Add regularization to loss.
        reg_params = self.policy.get_params(regularizable=True)
        loss += self.L2_reg_loss * TT.sum(
            [TT.mean(TT.square(param)) for param in reg_params]
        ) / len(reg_params)

        # Policy loss gradient.
        loss_grad = TT.grad(
            loss, self.policy.get_params(trainable=True))

        if is_recurrent:
            recurrent_vars = [valid_var]
        else:
            recurrent_vars = []

        input_list = [rewards, obs_var, feat_diff, action_var] + \
            state_info_vars_list + recurrent_vars + [param_eta, param_v]
        f_loss = ext.compile_function(
            inputs=input_list,
            outputs=loss,
        )
        f_loss_grad = ext.compile_function(
            inputs=input_list,
            outputs=loss_grad,
        )

        # KL diagnostic between old and new policy distributions
        old_dist_info_vars = {
            k: ext.new_tensor(
                'old_%s' % k,
                ndim=2 + is_recurrent,
                dtype=theano.config.floatX
            ) for k in dist.dist_info_keys
            }
        old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

        if is_recurrent:
            mean_kl = TT.sum(dist.kl_sym(old_dist_info_vars, dist_info_vars) * valid_var) / TT.sum(valid_var)
        else:
            mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

        f_kl = ext.compile_function(
            inputs=[obs_var, action_var] + state_info_vars_list + old_dist_info_vars_list + recurrent_vars,
            outputs=mean_kl,
        )

        # Dual-related symbolics
        # Symbolic dual
        if is_recurrent:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.sum(
                           TT.exp(
                               delta_v / param_eta - TT.max(delta_v / param_eta)
                           ) * valid_var
                       ) / TT.sum(valid_var)
                   ) + param_eta * TT.max(delta_v / param_eta)
        else:
            dual = param_eta * self.epsilon + \
                   param_eta * TT.log(
                       TT.mean(
                           TT.exp(
                               delta_v / param_eta - TT.max(delta_v / param_eta)
                           )
                       )
                   ) + param_eta * TT.max(delta_v / param_eta)
        # Add L2 regularization.
        dual += self.L2_reg_dual * \
                (TT.square(param_eta) + TT.square(1 / param_eta))

        # Symbolic dual gradient
        dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

        # Eval functions.
        f_dual = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
            outputs=dual
        )
        f_dual_grad = ext.compile_function(
            inputs=[rewards, feat_diff] + state_info_vars_list + recurrent_vars + [param_eta, param_v],
            outputs=dual_grad
        )

        self.opt_info = dict(
            f_loss_grad=f_loss_grad,
            f_loss=f_loss,
            f_dual=f_dual,
            f_dual_grad=f_dual_grad,
            f_kl=f_kl
        )
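The dual above is usually minimized over eta (and v) with an off-the-shelf optimizer via f_dual and f_dual_grad. A self-contained NumPy toy of the non-recurrent eta-dual, using a crude grid search in place of the real optimizer:

import numpy as np

delta = np.array([0.5, -0.2, 0.1])   # toy Bellman errors delta_v
epsilon = 0.1

def dual(eta):                       # eta * eps + eta * log mean exp(delta/eta)
    z = delta / eta
    return eta * epsilon + eta * (np.log(np.mean(np.exp(z - z.max()))) + z.max())

etas = np.linspace(0.05, 20.0, 400)
eta_star = etas[np.argmin([dual(e) for e in etas])]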
Example No. 52
    def __init__(
            self,
            env_spec,
            hidden_dim=32,
            feature_network=None,
            state_include_action=True,
            hidden_nonlinearity=NL.tanh):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        l_input = L.InputLayer(
            shape=(None, None, input_dim),
            name="input"
        )

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            l_feature = OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: TT.reshape(
                    flat_feature,
                    [input.shape[0], input.shape[1], feature_dim]
                ),
                shape_op=lambda _, input_shape: (input_shape[0], input_shape[1], feature_dim)
            )

        prob_network = GRUNetwork(
            input_shape=(feature_dim,),
            input_layer=l_feature,
            output_dim=env_spec.action_space.n,
            hidden_dim=hidden_dim,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=TT.nnet.softmax,
            name="prob_network"
        )

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = TT.matrix("flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(l_flat_feature, {feature_network.input_layer: flat_input_var})

        self.f_step_prob = ext.compile_function(
            [
                flat_input_var,
                prob_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                prob_network.step_output_layer,
                prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var})
        )

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.prev_action = None
        self.prev_hidden = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LasagnePowered.__init__(self, out_layers)
Example No. 53
    def __init__(
            self,
            env_spec,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        mean_network = MLP(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim,),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
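Given the mean and log_std that _f_dist returns, actions are drawn with the standard reparameterized sample; a sketch with stand-in values:

import numpy as np

mean = np.array([0.1, -0.2])             # stand-ins for _f_dist(obs) output
log_std = np.log(np.array([1.0, 0.5]))
noise = np.random.randn(*mean.shape)
action = mean + np.exp(log_std) * noise  # a ~ N(mean, diag(exp(log_std))^2)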
Example No. 54
    def init_opt(self):

        # First, create "target" policy and Q functions
        target_policy = pickle.loads(pickle.dumps(self.policy))
        target_qf = pickle.loads(pickle.dumps(self.qf))

        # The target values y are computed outside the graph and fed in
        obs = self.env.observation_space.new_tensor_variable(
            'obs',
            extra_dims=1,
        )

        # The yi values are computed separately as above and then passed to
        # the training functions below
        action = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        yvar = TT.vector('ys')

        qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
                               sum([TT.sum(TT.square(param)) for param in
                                    self.qf.get_params(regularizable=True)])

        qval = self.qf.get_qval_sym(obs, action)

        qf_loss = TT.mean(TT.square(yvar - qval))
        qf_reg_loss = qf_loss + qf_weight_decay_term

        policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
                                   sum([TT.sum(TT.square(param))
                                        for param in self.policy.get_params(regularizable=True)])
        policy_qval = self.qf.get_qval_sym(
            obs, self.policy.get_action_sym(obs),
            deterministic=True
        )
        policy_surr = -TT.mean(policy_qval)

        policy_reg_surr = policy_surr + policy_weight_decay_term

        qf_updates = self.qf_update_method(
            qf_reg_loss, self.qf.get_params(trainable=True))
        policy_updates = self.policy_update_method(
            policy_reg_surr, self.policy.get_params(trainable=True))

        f_train_qf = ext.compile_function(
            inputs=[yvar, obs, action],
            outputs=[qf_loss, qval],
            updates=qf_updates
        )

        f_train_policy = ext.compile_function(
            inputs=[obs],
            outputs=policy_surr,
            updates=policy_updates
        )

        self.opt_info = dict(
            f_train_qf=f_train_qf,
            f_train_policy=f_train_policy,
            target_qf=target_qf,
            target_policy=target_policy,
        )
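The ys fed to f_train_qf follow the usual DDPG target; a hedged sketch (batch arrays, target networks, and method names are assumptions based on the shape of this training loop):

# Sketch of the target computation that produces ys (names assumed):
#   next_actions, _ = target_policy.get_actions(next_obs)
#   ys = rewards + (1.0 - terminals) * discount * target_qf.get_qval(
#       next_obs, next_actions)
#   f_train_qf(ys, obs, actions); f_train_policy(obs)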