def init_opt(self):

    # First, create "target" policy and Q functions
    target_policy = pickle.loads(pickle.dumps(self.policy))
    target_qf = pickle.loads(pickle.dumps(self.qf))

    # The y targets need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1,
    )

    # The y_i values are computed separately outside this graph and then
    # passed to the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )

    yvar = TT.vector('ys')

    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([TT.sum(TT.square(param))
             for param in self.qf.get_params(regularizable=True)])

    qval = self.qf.get_qval_sym(obs, action)

    qf_loss = TT.mean(TT.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term

    policy_weight_decay_term = 0.5 * self.policy_weight_decay * sum([
        TT.sum(TT.square(param))
        for param in self.policy.get_params(regularizable=True)
    ])
    policy_qval = self.qf.get_qval_sym(
        obs, self.policy.get_action_sym(obs), deterministic=True)
    policy_surr = -TT.mean(policy_qval)

    policy_reg_surr = policy_surr + policy_weight_decay_term

    qf_updates = self.qf_update_method(
        qf_reg_loss, self.qf.get_params(trainable=True))
    policy_updates = self.policy_update_method(
        policy_reg_surr, self.policy.get_params(trainable=True))

    f_train_qf = ext.compile_function(
        inputs=[yvar, obs, action],
        outputs=[qf_loss, qval],
        updates=qf_updates)

    f_train_policy = ext.compile_function(
        inputs=[obs], outputs=policy_surr, updates=policy_updates)

    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )
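# A hedged sketch of how the y targets mentioned above are commonly formed
# outside this method, together with a soft update of the pickled target
# networks. All names here (target_policy_f, target_qf_f, tau, ...) are
# illustrative assumptions, not attributes of this class.
def compute_ys(rewards, terminals, next_obs, target_policy_f, target_qf_f,
               discount):
    # y_i = r_i + gamma * (1 - terminal_i) * Q'(s'_i, mu'(s'_i))
    next_actions = target_policy_f(next_obs)
    next_qvals = target_qf_f(next_obs, next_actions)
    return rewards + (1.0 - terminals) * discount * next_qvals


def soft_update(target_params, source_params, tau):
    # theta_target <- tau * theta + (1 - tau) * theta_target
    return [tau * s + (1.0 - tau) * t
            for t, s in zip(target_params, source_params)]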
def __init__(self, _p, inputs, s, costs, h=None, ha=None):
    '''Constructs and compiles the necessary Theano functions.

    _p : list of Theano shared variables
        Parameters of the model to be optimized.
    inputs : list of Theano variables
        Symbolic variables that are inputs to your graph (they should also
        include your model 'output'). Your training examples must fit these.
    s : Theano variable
        Symbolic variable with respect to which the Hessian of the objective
        is positive-definite, implicitly defining the Gauss-Newton matrix.
        Typically, it is the activation of the output layer.
    costs : list of Theano variables
        Monitoring costs, the first of which will be the optimized objective.
    h : Theano variable or None
        Structural damping is applied to this variable (typically the hidden
        units of an RNN).
    ha : Theano variable or None
        Symbolic variable that implicitly defines the Gauss-Newton matrix for
        the structural damping term (typically the activation of the hidden
        layer). If None, it will be set to `h`.'''

    self.p = _p
    self.shapes = [i.get_value().shape for i in _p]
    self.sizes = list(map(numpy.prod, self.shapes))
    self.positions = numpy.cumsum([0] + self.sizes)[:-1]

    g = T.grad(costs[0], _p)
    g = list(map(T.as_tensor_variable, g))  # for CudaNdarray
    # used during gradient computation
    self.f_gc = compile_function(inputs, g + costs)
    # for quick cost evaluation
    self.f_cost = compile_function(inputs, costs)

    symbolic_types = T.scalar, T.vector, T.matrix, T.tensor3, T.tensor4

    v = [symbolic_types[len(i)]() for i in self.shapes]
    Gv = gauss_newton_product(costs[0], _p, v, s)

    coefficient = T.scalar()  # this is lambda * mu
    if h is not None:  # structural damping with cross-entropy
        # T.Rop does not support `consider_constant` yet, so use `givens`
        h_constant = symbolic_types[h.ndim]()
        structural_damping = coefficient * (
            -h_constant * T.log(h + 1e-10) -
            (1 - h_constant) * T.log((1 - h) + 1e-10)).sum() / h.shape[0]
        if ha is None:
            ha = h
        Gv_damping = gauss_newton_product(structural_damping, _p, v, ha)
        Gv = [a + b for a, b in zip(Gv, Gv_damping)]
        givens = {h_constant: h}
    else:
        givens = {}

    self.function_Gv = compile_function(
        inputs + v + [coefficient], Gv, givens=givens)
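# A minimal sketch (assumed helper, not part of this class) of how the
# `shapes`/`sizes`/`positions` bookkeeping above supports packing the
# per-parameter arrays into one flat vector and unpacking it again, which the
# conjugate-gradient inner loop of a Hessian-free optimizer needs.
import numpy

shapes = [(3, 2), (2,)]
sizes = list(map(numpy.prod, shapes))
positions = numpy.cumsum([0] + sizes)[:-1]


def pack(tensors):
    # concatenate per-parameter arrays into one flat vector
    return numpy.concatenate([t.ravel() for t in tensors])


def unpack(flat):
    # slice the flat vector back into the original parameter shapes
    return [flat[pos:pos + size].reshape(shape)
            for pos, size, shape in zip(positions, sizes, shapes)]


params = [numpy.arange(6.).reshape(3, 2), numpy.array([7., 8.])]
assert all((a == b).all() for a, b in zip(unpack(pack(params)), params))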
def update_opt(self,
               loss,
               target,
               leq_constraint,
               inputs,
               constraint_name="constraint",
               *args,
               **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
        implement methods of the
        :class:`garage.core.parameterized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon),
        of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs
    :return: No return value.
    """
    constraint_term, constraint_value = leq_constraint
    penalty_var = TT.scalar("penalty")
    penalized_loss = loss + penalty_var * constraint_term

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    def get_opt_output():
        flat_grad = flatten_tensor_variables(
            theano.grad(
                penalized_loss,
                target.get_params(trainable=True),
                disconnected_inputs='ignore'))
        return [
            penalized_loss.astype('float64'),
            flat_grad.astype('float64')
        ]

    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs, loss, log_name="f_loss"),
        f_constraint=lambda: compile_function(
            inputs, constraint_term, log_name="f_constraint"),
        f_penalized_loss=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=[penalized_loss, loss, constraint_term],
            log_name="f_penalized_loss",
        ),
        f_opt=lambda: compile_function(
            inputs=inputs + [penalty_var],
            outputs=get_opt_output(),
            log_name="f_opt"))
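# A hedged sketch of how the penalized objective compiled above is typically
# used: the penalty coefficient is adapted until the constraint
# f(*inputs) <= epsilon holds. `minimize` and `f_constraint` stand in for an
# unconstrained optimizer over the flat parameters and the compiled
# constraint function; they are illustrative, not the class's actual
# optimize() implementation.
def penalty_search(f_constraint, minimize, epsilon,
                   penalty=1.0, scale=2.0, max_tries=10):
    for _ in range(max_tries):
        minimize(penalty)               # optimize loss + penalty * constraint
        if f_constraint() <= epsilon:   # constraint satisfied: soften penalty
            penalty /= scale
        else:                           # constraint violated: penalize harder
            penalty *= scale
    return penalty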
def update_opt(self,
               loss,
               target,
               inputs,
               network_outputs,
               extra_inputs=None):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
        implement methods of the
        :class:`garage.core.parameterized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :return: No return value.
    """
    self._target = target

    if extra_inputs is None:
        extra_inputs = list()

    self._hf_optimizer = hf_optimizer(
        _p=target.get_params(trainable=True),
        inputs=(inputs + extra_inputs),
        s=network_outputs,
        costs=[loss],
    )

    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs + extra_inputs, loss),
    )
def test_gru_network():
    network = GRUNetwork(
        input_shape=(2, 3),
        output_dim=5,
        hidden_dim=4,
    )
    f_output = ext.compile_function(
        inputs=[network.input_layer.input_var],
        outputs=L.get_output(network.output_layer))
    assert f_output(np.zeros((6, 8, 2, 3))).shape == (6, 8, 5)
def update_opt(self,
               loss,
               target,
               inputs,
               extra_inputs=None,
               gradients=None,
               *args,
               **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
        implement methods of the
        :class:`garage.core.parameterized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param gradients: symbolic expressions for the gradients of trainable
        parameters of the target. By default this will be computed by
        calling theano.grad
    :return: No return value.
    """
    self._target = target

    def get_opt_output(gradients):
        if gradients is None:
            gradients = theano.grad(
                loss, target.get_params(trainable=True))
        flat_grad = flatten_tensor_variables(gradients)
        return [loss.astype('float64'), flat_grad.astype('float64')]

    if extra_inputs is None:
        extra_inputs = list()

    self._opt_fun = lazydict(
        f_loss=lambda: compile_function(inputs + extra_inputs, loss),
        f_opt=lambda: compile_function(
            inputs=inputs + extra_inputs,
            outputs=get_opt_output(gradients),
        ))
def update_opt(self,
               loss,
               target,
               inputs,
               extra_inputs=None,
               gradients=None,
               **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
        implement methods of the
        :class:`garage.core.parameterized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param gradients: symbolic expressions for the gradients of trainable
        parameters of the target. By default this will be computed by
        calling theano.grad
    :return: No return value.
    """
    self._target = target

    if gradients is None:
        gradients = theano.grad(
            loss,
            target.get_params(trainable=True),
            disconnected_inputs='ignore')

    updates = self._update_method(
        gradients, target.get_params(trainable=True))
    updates = OrderedDict(
        [(k, v.astype(k.dtype)) for k, v in updates.items()])

    if extra_inputs is None:
        extra_inputs = list()

    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(inputs + extra_inputs, loss),
        f_opt=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            updates=updates,
        ))
def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=NL.rectify,
        output_nonlinearity=NL.softmax,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, gym.spaces.Discrete)
    self._env_spec = env_spec

    if prob_network is None:
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
            name="prob_network",
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(self,
             env_spec,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=NL.rectify,
             hidden_w_init=LI.HeUniform(),
             hidden_b_init=LI.Constant(0.),
             output_nonlinearity=NL.tanh,
             output_w_init=LI.Uniform(-3e-3, 3e-3),
             output_b_init=LI.Uniform(-3e-3, 3e-3),
             bn=False):
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(
        shape=(None, env_spec.observation_space.flat_dim))

    l_hidden = l_obs
    if bn:
        l_hidden = batch_norm(l_hidden)

    for idx, size in enumerate(hidden_sizes):
        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            W=hidden_w_init,
            b=hidden_b_init,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % idx)
        if bn:
            l_hidden = batch_norm(l_hidden)

    l_output = L.DenseLayer(
        l_hidden,
        num_units=env_spec.action_space.flat_dim,
        W=output_w_init,
        b=output_b_init,
        nonlinearity=output_nonlinearity,
        name="output")

    # Note the deterministic=True argument. It makes sure that when getting
    # actions from single observations, we do not update params in the
    # batch normalization layers
    action_var = L.get_output(l_output, deterministic=True)

    self._output_layer = l_output

    self._f_actions = ext.compile_function([l_obs.input_var], action_var)

    super(DeterministicMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [l_output])
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: sizes list for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(
                env_spec.observation_space.flat_dim * num_seq_inputs, ),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff

    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(constraint_grads)

    def f_Hx_plain(*args):
        inputs_ = args[:len(inputs)]
        xs = args[len(inputs):]
        flat_xs = np.concatenate([np.reshape(x, (-1, )) for x in xs])
        param_val = self.target.get_param_values(trainable=True)
        eps = np.cast['float32'](
            self.base_eps / (np.linalg.norm(param_val) + 1e-8))
        self.target.set_param_values(
            param_val + eps * flat_xs, trainable=True)
        flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
        if self.symmetric:
            self.target.set_param_values(
                param_val - eps * flat_xs, trainable=True)
            flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
            self.target.set_param_values(param_val, trainable=True)
        else:
            self.target.set_param_values(param_val, trainable=True)
            flat_grad = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad) / eps
        return hx

    self.opt_fun = ext.lazydict(
        f_grad=lambda: ext.compile_function(
            inputs=inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_Hx_plain=lambda: f_Hx_plain,
    )
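# A self-contained numerical sketch (assumed, not part of this class) of the
# symmetric finite-difference Hessian-vector product used in f_Hx_plain above:
# H v ~= (grad f(theta + eps v) - grad f(theta - eps v)) / (2 eps).
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 2.0]])  # Hessian of the quadratic below


def grad_f(theta):
    # gradient of f(theta) = 0.5 * theta^T A theta
    return A @ theta


theta = np.array([0.5, -1.0])
v = np.array([1.0, 2.0])
eps = 1e-5 / (np.linalg.norm(theta) + 1e-8)  # step scaled as in the code above

hv_fd = (grad_f(theta + eps * v) - grad_f(theta - eps * v)) / (2 * eps)
assert np.allclose(hv_fd, A @ v, atol=1e-4)  # matches the exact product A v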
def update_opt(self, f, target, inputs, reg_coeff):
    self.target = target
    self.reg_coeff = reg_coeff

    params = target.get_params(trainable=True)

    constraint_grads = theano.grad(
        f, wrt=params, disconnected_inputs='warn')
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])

    def Hx_plain():
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x) for g, x in zip(constraint_grads, xs)]),
            wrt=params,
            disconnected_inputs='warn')
        return TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    # Init dual param values
    self.param_eta = 15.
    # Adjust for linear feature vector.
    self.param_v = np.random.rand(
        self.env.observation_space.flat_dim * 2 + 4)

    # Theano vars
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    rewards = ext.new_tensor(
        'rewards',
        ndim=1 + is_recurrent,
        dtype=theano.config.floatX,
    )
    # Feature difference variable representing the difference in feature
    # value of the next observation and the current observation
    # \phi(s') - \phi(s).
    feat_diff = ext.new_tensor(
        'feat_diff', ndim=2 + is_recurrent, dtype=theano.config.floatX)
    param_v = TT.vector('param_v')
    param_eta = TT.scalar('eta')

    valid_var = TT.matrix('valid')

    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    # Policy-related symbolics
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    dist = self.policy.distribution
    # log of the policy dist
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)

    # Symbolic sample Bellman error
    delta_v = rewards + TT.dot(feat_diff, param_v)

    # Policy loss (negative because we minimize)
    if is_recurrent:
        loss = -TT.sum(
            logli * TT.exp(delta_v / param_eta -
                           TT.max(delta_v / param_eta)) *
            valid_var) / TT.sum(valid_var)
    else:
        loss = -TT.mean(
            logli * TT.exp(delta_v / param_eta -
                           TT.max(delta_v / param_eta)))

    # Add regularization to loss.
    reg_params = self.policy.get_params(regularizable=True)
    loss += self.L2_reg_loss * TT.sum(
        [TT.mean(TT.square(param))
         for param in reg_params]) / len(reg_params)

    # Policy loss gradient.
    loss_grad = TT.grad(loss, self.policy.get_params(trainable=True))

    if is_recurrent:
        recurrent_vars = [valid_var]
    else:
        recurrent_vars = []

    input = [
        rewards, obs_var, feat_diff, action_var
    ] + state_info_vars_list + recurrent_vars + [param_eta, param_v]

    f_loss = ext.compile_function(
        inputs=input,
        outputs=loss,
    )
    f_loss_grad = ext.compile_function(
        inputs=input,
        outputs=loss_grad,
    )

    # Debug prints
    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent,
            dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    if is_recurrent:
        mean_kl = TT.sum(
            dist.kl_sym(old_dist_info_vars, dist_info_vars) *
            valid_var) / TT.sum(valid_var)
    else:
        mean_kl = TT.mean(dist.kl_sym(old_dist_info_vars, dist_info_vars))

    f_kl = ext.compile_function(
        inputs=[obs_var, action_var] + state_info_vars_list +
        old_dist_info_vars_list + recurrent_vars,
        outputs=mean_kl,
    )

    # Dual-related symbolics
    # Symbolic dual
    if is_recurrent:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.sum(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    ) * valid_var
                ) / TT.sum(valid_var)
            ) + param_eta * TT.max(delta_v / param_eta)
    else:
        dual = param_eta * self.epsilon + \
            param_eta * TT.log(
                TT.mean(
                    TT.exp(
                        delta_v / param_eta - TT.max(delta_v / param_eta)
                    )
                )
            ) + param_eta * TT.max(delta_v / param_eta)
    # Add L2 regularization.
    dual += self.L2_reg_dual * \
        (TT.square(param_eta) + TT.square(1 / param_eta))

    # Symbolic dual gradient
    dual_grad = TT.grad(cost=dual, wrt=[param_eta, param_v])

    # Eval functions.
    f_dual = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list +
        recurrent_vars + [param_eta, param_v],
        outputs=dual)
    f_dual_grad = ext.compile_function(
        inputs=[rewards, feat_diff] + state_info_vars_list +
        recurrent_vars + [param_eta, param_v],
        outputs=dual_grad)

    self.opt_info = dict(
        f_loss_grad=f_loss_grad,
        f_loss=f_loss,
        f_dual=f_dual,
        f_dual_grad=f_dual_grad,
        f_kl=f_kl)
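# A hedged NumPy restatement (illustrative names, not part of this class) of
# the non-recurrent dual compiled above: delta = r + (phi(s') - phi(s)) . v
# and g(eta, v) = eta * epsilon + eta * log mean exp(delta / eta), evaluated
# with the max subtracted for numerical stability, exactly as in the
# symbolic expression, plus the same L2 term on eta.
import numpy as np


def reps_dual(eta, v, rewards, feat_diff, epsilon, l2_reg_dual=0.0):
    delta = rewards + feat_diff @ v
    z = delta / eta
    z_max = z.max()
    dual = eta * epsilon + eta * np.log(np.mean(np.exp(z - z_max))) \
        + eta * z_max
    dual += l2_reg_dual * (eta ** 2 + (1.0 / eta) ** 2)
    return dual


# toy usage with random features and rewards
rng = np.random.RandomState(0)
print(reps_dual(eta=15.0, v=rng.rand(4), rewards=rng.rand(8),
                feat_diff=rng.rand(8, 4), epsilon=0.1))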
def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
        batchsize=None,
        subsample_factor=1.,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
        network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param learn_std: Whether to learn the standard deviations. Only
        effective if adaptive_std is False. If adaptive_std is True, this
        parameter is ignored, and the weights for the std network are
        always learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the
        std network. Only used if `std_share_network` is False. It defaults
        to the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std
        network. Only used if `std_share_network` is False. It defaults to
        the same non-linearity as the mean.
    """
    Serializable.quick_init(self, locals())

    self._batchsize = batchsize
    self._subsample_factor = subsample_factor

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self._optimizer = optimizer

    if mean_network is None:
        mean_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=None,
        )

    l_mean = mean_network.output_layer

    if adaptive_std:
        l_log_std = MLP(
            input_shape=input_shape,
            input_var=mean_network.input_layer.input_var,
            output_dim=output_dim,
            hidden_sizes=std_hidden_sizes,
            hidden_nonlinearity=std_nonlinearity,
            output_nonlinearity=None,
        ).output_layer
    else:
        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=output_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

    LasagnePowered.__init__(self, [l_mean, l_log_std])

    xs_var = mean_network.input_layer.input_var
    ys_var = TT.matrix("ys")
    old_means_var = TT.matrix("old_means")
    old_log_stds_var = TT.matrix("old_log_stds")

    x_mean_var = theano.shared(
        np.zeros((1, ) + input_shape, dtype=theano.config.floatX),
        name="x_mean",
        broadcastable=(True, ) + (False, ) * len(input_shape))
    x_std_var = theano.shared(
        np.ones((1, ) + input_shape, dtype=theano.config.floatX),
        name="x_std",
        broadcastable=(True, ) + (False, ) * len(input_shape))
    y_mean_var = theano.shared(
        np.zeros((1, output_dim), dtype=theano.config.floatX),
        name="y_mean",
        broadcastable=(True, False))
    y_std_var = theano.shared(
        np.ones((1, output_dim), dtype=theano.config.floatX),
        name="y_std",
        broadcastable=(True, False))

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var
    normalized_ys_var = (ys_var - y_mean_var) / y_std_var

    normalized_means_var = L.get_output(
        l_mean, {mean_network.input_layer: normalized_xs_var})
    normalized_log_stds_var = L.get_output(
        l_log_std, {mean_network.input_layer: normalized_xs_var})

    means_var = normalized_means_var * y_std_var + y_mean_var
    log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

    normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
    normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

    dist = self._dist = DiagonalGaussian(output_dim)

    normalized_dist_info_vars = dict(
        mean=normalized_means_var, log_std=normalized_log_stds_var)

    mean_kl = TT.mean(
        dist.kl_sym(
            dict(
                mean=normalized_old_means_var,
                log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

    loss = -TT.mean(
        dist.log_likelihood_sym(normalized_ys_var,
                                normalized_dist_info_vars))

    self._f_predict = compile_function([xs_var], means_var)
    self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
    self._l_mean = l_mean
    self._l_log_std = l_log_std

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[normalized_means_var, normalized_log_stds_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [
            xs_var, ys_var, old_means_var, old_log_stds_var
        ]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._normalize_outputs = normalize_outputs
    self._mean_network = mean_network
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
    self._y_mean_var = y_mean_var
    self._y_std_var = y_std_var
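# A small numeric sketch (assumed helper, not part of this class) of the
# output un-normalization used above: the network predicts the mean and log
# std of the *normalized* targets, and the un-normalized distribution is
# recovered as mean * y_std + y_mean and log_std + log(y_std).
import numpy as np


def denormalize(normalized_mean, normalized_log_std, y_mean, y_std):
    mean = normalized_mean * y_std + y_mean
    log_std = normalized_log_std + np.log(y_std)
    return mean, log_std


y_mean, y_std = np.array([2.0]), np.array([3.0])
mean, log_std = denormalize(np.array([0.5]), np.array([0.0]), y_mean, y_std)
assert np.allclose(mean, 3.5) and np.allclose(np.exp(log_std), 3.0)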
def update_opt(self,
               loss,
               target,
               leq_constraint,
               inputs,
               extra_inputs=None,
               constraint_name="constraint",
               *args,
               **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should
        implement methods of the
        :class:`garage.core.parameterized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon),
        of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be
        subsampled if needed. It is assumed that the first dimension of
        these inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which
        should not be subsampled
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params, disconnected_inputs='warn')
    flat_grad = ext.flatten_tensor_variables(grads)

    self._hvp_approach.update_opt(
        f=constraint_term,
        target=target,
        inputs=inputs + extra_inputs,
        reg_coeff=self._reg_coeff)

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
        ),
    )
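# A minimal conjugate-gradient sketch (assumed; the actual solver lives in
# the optimizer's optimize() path, which is not shown here) of how a
# Hessian-vector product callable like f_Hx_plain is typically consumed:
# solve H x = g for the step direction without ever materializing H.
import numpy as np


def conjugate_gradient(hvp, g, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()          # residual g - H x, with x = 0 initially
    p = r.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Hp = hvp(p)
        alpha = rdotr / p.dot(Hp)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x


# usage with an explicit symmetric positive-definite matrix standing in for H
H = np.array([[3.0, 1.0], [1.0, 2.0]])
g = np.array([1.0, -1.0])
x = conjugate_gradient(lambda v: H @ v, g)
assert np.allclose(H @ x, g, atol=1e-6)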
def __init__(self,
             env_spec,
             hidden_dim=32,
             feature_network=None,
             state_include_action=True,
             hidden_nonlinearity=NL.tanh):
    """
    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)

    Serializable.quick_init(self, locals())
    super(CategoricalGRUPolicy, self).__init__(env_spec)

    obs_dim = env_spec.observation_space.flat_dim
    action_flat_dim = env_spec.action_space.flat_dim

    if state_include_action:
        input_dim = obs_dim + action_flat_dim
    else:
        input_dim = obs_dim

    l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

    if feature_network is None:
        feature_dim = input_dim
        l_flat_feature = None
        l_feature = l_input
    else:
        feature_dim = feature_network.output_layer.output_shape[-1]
        l_flat_feature = feature_network.output_layer
        l_feature = OpLayer(
            l_flat_feature,
            extras=[l_input],
            name="reshape_feature",
            op=lambda flat_feature, input: TT.reshape(
                flat_feature,
                [input.shape[0], input.shape[1], feature_dim]),
            shape_op=lambda _, input_shape: (
                input_shape[0], input_shape[1], feature_dim))

    prob_network = GRUNetwork(
        input_shape=(feature_dim, ),
        input_layer=l_feature,
        output_dim=env_spec.action_space.n,
        hidden_dim=hidden_dim,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=TT.nnet.softmax,
        name="prob_network")

    self.prob_network = prob_network
    self.feature_network = feature_network
    self.l_input = l_input
    self.state_include_action = state_include_action

    flat_input_var = TT.matrix("flat_input")
    if feature_network is None:
        feature_var = flat_input_var
    else:
        feature_var = L.get_output(
            l_flat_feature, {feature_network.input_layer: flat_input_var})

    self.f_step_prob = ext.compile_function(
        [flat_input_var, prob_network.step_prev_hidden_layer.input_var],
        L.get_output([
            prob_network.step_output_layer, prob_network.step_hidden_layer
        ], {prob_network.step_input_layer: feature_var}))

    self.input_dim = input_dim
    self.action_flat_dim = action_flat_dim
    self.hidden_dim = hidden_dim

    self.prev_action = None
    self.prev_hidden = None
    self.dist = RecurrentCategorical(env_spec.action_space.n)

    out_layers = [prob_network.output_layer]
    if feature_network is not None:
        out_layers.append(feature_network.output_layer)

    LasagnePowered.__init__(self, out_layers)
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, ),
        state_include_action=True,
        hidden_nonlinearity=NL.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
):
    """
    :param env_spec: A spec for the env.
    :param hidden_sizes: sizes list for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    Serializable.quick_init(self, locals())
    super(GaussianGRUPolicy, self).__init__(env_spec)

    assert len(hidden_sizes) == 1

    if state_include_action:
        obs_dim = env_spec.observation_space.flat_dim + \
            env_spec.action_space.flat_dim
    else:
        obs_dim = env_spec.observation_space.flat_dim
    action_flat_dim = env_spec.action_space.flat_dim

    mean_network = GRUNetwork(
        input_shape=(obs_dim, ),
        output_dim=action_flat_dim,
        hidden_dim=hidden_sizes[0],
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
    )

    l_mean = mean_network.output_layer

    obs_var = mean_network.input_var

    l_log_std = ParamLayer(
        mean_network.input_layer,
        num_units=action_flat_dim,
        param=lasagne.init.Constant(np.log(init_std)),
        name="output_log_std",
        trainable=learn_std,
    )

    l_step_log_std = ParamLayer(
        mean_network.step_input_layer,
        num_units=action_flat_dim,
        param=l_log_std.param,
        name="step_output_log_std",
        trainable=learn_std,
    )

    self._mean_network = mean_network
    self._l_log_std = l_log_std
    self._state_include_action = state_include_action

    self._f_step_mean_std = ext.compile_function(
        [
            mean_network.step_input_layer.input_var,
            mean_network.step_prev_hidden_layer.input_var
        ],
        L.get_output([
            mean_network.step_output_layer, l_step_log_std,
            mean_network.step_hidden_layer
        ]))

    self._prev_action = None
    self._prev_hidden = None
    self._hidden_sizes = hidden_sizes
    self._dist = RecurrentDiagonalGaussian(action_flat_dim)

    self.reset()

    LasagnePowered.__init__(self, [mean_network.output_layer, l_log_std])
def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean
        network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self.output_dim = output_dim
    self._optimizer = optimizer

    if prob_network is None:
        prob_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    l_prob = prob_network.output_layer

    LasagnePowered.__init__(self, [l_prob])

    xs_var = prob_network.input_layer.input_var
    ys_var = TT.imatrix("ys")
    old_prob_var = TT.matrix("old_prob")

    x_mean_var = theano.shared(
        np.zeros((1, ) + input_shape),
        name="x_mean",
        broadcastable=(True, ) + (False, ) * len(input_shape))
    x_std_var = theano.shared(
        np.ones((1, ) + input_shape),
        name="x_std",
        broadcastable=(True, ) + (False, ) * len(input_shape))

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var

    prob_var = L.get_output(
        l_prob, {prob_network.input_layer: normalized_xs_var})

    old_info_vars = dict(prob=old_prob_var)
    info_vars = dict(prob=prob_var)

    dist = self._dist = Categorical(output_dim)

    mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

    loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

    predicted = special.to_onehot_sym(
        TT.argmax(prob_var, axis=1), output_dim)

    self._f_predict = ext.compile_function([xs_var], predicted)
    self._f_prob = ext.compile_function([xs_var], prob_var)
    self._prob_network = prob_network
    self._l_prob = l_prob

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[prob_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
):
    """
    :param env_spec:
    :param hidden_sizes: sizes list for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: sizes list for the fully-connected layers
        for std
    :param min_std: lower bound enforced on the std, to avoid numerical
        issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_flat_dim = env_spec.action_space.flat_dim

    # create network
    if mean_network is None:
        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_flat_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
    self._mean_network = mean_network

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_layer.input_var

    if std_network is not None:
        l_log_std = std_network.output_layer
    else:
        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_flat_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_flat_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

    self.min_std = min_std

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        log_std_var = TT.maximum(log_std_var, np.log(min_std))

    self._mean_var, self._log_std_var = mean_var, log_std_var

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = dist_cls(action_flat_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = ext.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=theano.config.floatX)
    dist = self.policy.distribution

    old_dist_info_vars = {
        k: ext.new_tensor(
            'old_%s' % k, ndim=2 + is_recurrent,
            dtype=theano.config.floatX)
        for k in dist.dist_info_keys
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    if is_recurrent:
        valid_var = TT.matrix('valid')
    else:
        valid_var = None

    state_info_vars = {
        k: ext.new_tensor(k, ndim=2 + is_recurrent,
                          dtype=theano.config.floatX)
        for k in self.policy.state_info_keys
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -TT.sum(
            logli * advantage_var * valid_var) / TT.sum(valid_var)
        mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var)
        max_kl = TT.max(kl * valid_var)
    else:
        surr_obj = -TT.mean(logli * advantage_var)
        mean_kl = TT.mean(kl)
        max_kl = TT.max(kl)

    input_list = [obs_var, action_var, advantage_var
                  ] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(
        surr_obj, target=self.policy, inputs=input_list)

    f_kl = ext.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )

    self.opt_info = dict(f_kl=f_kl, )
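# A hedged NumPy restatement (illustrative names, not part of this class) of
# the surrogate objective built above: the negative likelihood-ratio
# policy-gradient estimator, with the `valid` mask applied in the recurrent
# case so padded timesteps do not contribute.
import numpy as np


def surrogate_loss(logli, advantages, valids=None):
    if valids is None:
        return -np.mean(logli * advantages)
    return -np.sum(logli * advantages * valids) / np.sum(valids)


# toy usage: log-likelihoods of the sampled actions and their advantages
logli = np.log(np.array([0.2, 0.7, 0.5]))
adv = np.array([1.0, -0.5, 2.0])
print(surrogate_loss(logli, adv))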
def __init__(self,
             env_spec,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=NL.rectify,
             hidden_w_init=lasagne.init.HeUniform(),
             hidden_b_init=lasagne.init.Constant(0.),
             action_merge_layer=-2,
             output_nonlinearity=None,
             output_w_init=lasagne.init.Uniform(-3e-3, 3e-3),
             output_b_init=lasagne.init.Uniform(-3e-3, 3e-3),
             bn=False):
    Serializable.quick_init(self, locals())

    l_obs = L.InputLayer(
        shape=(None, env_spec.observation_space.flat_dim), name="obs")
    l_action = L.InputLayer(
        shape=(None, env_spec.action_space.flat_dim), name="actions")

    n_layers = len(hidden_sizes) + 1

    if n_layers > 1:
        action_merge_layer = \
            (action_merge_layer % n_layers + n_layers) % n_layers
    else:
        action_merge_layer = 1

    l_hidden = l_obs

    for idx, size in enumerate(hidden_sizes):
        if bn:
            l_hidden = batch_norm(l_hidden)

        if idx == action_merge_layer:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_hidden = L.DenseLayer(
            l_hidden,
            num_units=size,
            W=hidden_w_init,
            b=hidden_b_init,
            nonlinearity=hidden_nonlinearity,
            name="h%d" % (idx + 1))

    if action_merge_layer == n_layers:
        l_hidden = L.ConcatLayer([l_hidden, l_action])

    l_output = L.DenseLayer(
        l_hidden,
        num_units=1,
        W=output_w_init,
        b=output_b_init,
        nonlinearity=output_nonlinearity,
        name="output")

    output_var = L.get_output(l_output, deterministic=True).flatten()

    self._f_qval = ext.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action
    self._output_nonlinearity = output_nonlinearity

    LasagnePowered.__init__(self, [l_output])