class RecurrentCategorical(Distribution):
    def __init__(self, dim):
        self._cat = Categorical(dim)
        self._dim = dim

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return TT.sum(
            old_prob_var * (TT.log(old_prob_var + TINY) - TT.log(new_prob_var + TINY)),
            axis=2
        )

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(
            old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
            axis=2
        )

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = x_var.shape[-1]
        flat_ratios = self._cat.likelihood_ratio_sym(
            x_var.reshape((-1, a_dim)),
            dict(prob=old_prob_var.reshape((-1, a_dim))),
            dict(prob=new_prob_var.reshape((-1, a_dim))),
        )
        return flat_ratios.reshape(old_prob_var.shape[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = probs.shape[-1]
        # a_dim = TT.printing.Print("lala")(a_dim)
        flat_logli = self._cat.log_likelihood_sym(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A
        a_dim = probs.shape[-1]
        flat_logli = self._cat.log_likelihood_sym(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_keys(self):
        return ["prob"]
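# Illustrative example (not part of the original source): a minimal NumPy check of the
# per-timestep KL computation above, assuming the N * T * A layout (batch, time, action).
# TINY matches the small constant used throughout; 1e-8 is an assumed value here.
import numpy as np

TINY = 1e-8
old_prob = np.array([[[0.7, 0.2, 0.1], [0.5, 0.3, 0.2]]])   # shape (1, 2, 3)
new_prob = np.array([[[0.6, 0.3, 0.1], [0.4, 0.4, 0.2]]])   # shape (1, 2, 3)
# KL(old || new) summed over the action axis only, leaving one value per (trajectory, timestep)
kl = np.sum(old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), axis=2)
print(kl.shape)  # (1, 2)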
def __init__(
        self,
        env_spec,
        hidden_sizes=(),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
        neat_output_dim=20,
        neat_network=None,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param neat_output_dim: output dimension of the randomly initialized NEAT feature network
    :param neat_network: manually specified NEAT feature network; if None, a random MLP is created
    :param prob_network: manually specified network for this policy, other network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    # create random NEAT MLP
    if neat_network is None:
        neat_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,),
            output_dim=neat_output_dim,
            hidden_sizes=(12, 12),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.identity,
        )

    if prob_network is None:
        prob_network = MLP(
            input_shape=(L.get_output_shape(neat_network.output_layer)[1],),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._phi = neat_network.output_layer
    self._obs = neat_network.input_layer
    self._neat_output = ext.compile_function(
        [neat_network.input_layer.input_var],
        L.get_output(neat_network.output_layer)
    )

    self.prob_network = prob_network
    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(PowerGradientPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(
        self,
        env_spec,
        conv_filters, conv_filter_sizes, conv_strides, conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=NL.rectify,
        output_nonlinearity=NL.softmax,
        prob_network=None,
        name=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)
    self._env_spec = env_spec

    if prob_network is None:
        if not name:
            name = "categorical_conv_prob_network"
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name=name,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        num_seq_inputs=1,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)
    self._env_spec = env_spec

    q_network = MLP(
        input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,),
        output_dim=env_spec.action_space.n,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=NL.linear,
        name=name,
    )

    self._l_q = q_network.output_layer
    self._l_obs = q_network.input_layer
    self._f_q = ext.compile_function(
        [q_network.input_layer.input_var],
        L.get_output(q_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMlpQPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [q_network.output_layer])
def __init__(self, discrete_dim, chain_trigger, chain_distr):
    """
    Args:
        discrete_dim: Cardinality of the categorical distribution.
        chain_trigger: Value of the categorical distribution which should
            trigger the chained distribution.
        chain_distr: A child `Distribution` instance which is triggered when
            the parent categorical distribution selects the particular value
            `chain_trigger`. This should be a discrete distribution; bad
            things will happen if it is not one.
    """
    self._prior_distr = Categorical(discrete_dim)
    self._chain_trigger = chain_trigger
    self._chain_distr = chain_distr

    # This is easier to code if the chain trigger is the final choice in
    # the categorical space.
    assert self._chain_trigger == discrete_dim - 1
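# Illustrative sketch (not from the original source): how the chaining described in the
# docstring could behave at sampling time. `prior_probs` and `chain_sample` are hypothetical
# stand-ins for the parent categorical and the chained child distribution.
import numpy as np

def sample_chained(prior_probs, chain_trigger, chain_sample, rng=np.random):
    """Sample the parent categorical; if it lands on `chain_trigger`
    (assumed to be the last category), also sample the chained distribution."""
    parent = rng.choice(len(prior_probs), p=prior_probs)
    if parent == chain_trigger:
        return parent, chain_sample()
    return parent, None

# e.g. a 4-way categorical whose last value additionally triggers a child coin flip
parent, child = sample_chained(
    [0.2, 0.3, 0.3, 0.2], chain_trigger=3,
    chain_sample=lambda: np.random.randint(2))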
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        prob_network=None,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)

    if prob_network is None:
        prob_network = MLP(
            input_shape=(env_spec.observation_space.flat_dim,),
            output_dim=env_spec.action_space.n,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer)
    )

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
def __init__(
        self,
        name,
        env_spec,
        conv_filters, conv_filter_sizes, conv_strides, conv_pads,
        hidden_sizes=[],
        hidden_nonlinearity=NL.rectify,
        output_nonlinearity=NL.softmax,
        prob_network=None,
        feature_layer_index=-2,
        eps=0,
):
    """
    The policy consists of several convolution layers followed by fc layers and softmax
    :param env_spec: A spec for the mdp.
    :param conv_filters, conv_filter_sizes, conv_strides, conv_pads: specify the convolutional
        layers. See rllab.core.network.ConvNetwork for details.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params are ignored
    :param feature_layer_index: index of the feature layer. Default -2 means the last layer
        before fc-softmax
    :param eps: mixture weight on uniform distribution; useful to force exploration
    :return:
    """
    Serializable.quick_init(self, locals())

    assert isinstance(env_spec.action_space, Discrete)
    self._env_spec = env_spec

    if prob_network is None:
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
            name="prob_network",
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer

    # mix in uniform distribution
    n_actions = env_spec.action_space.n
    uniform_prob = np.ones(n_actions, dtype=theano.config.floatX) / n_actions
    eps_var = theano.shared(
        eps,
        name="eps",
    )
    nn_prob = L.get_output(prob_network.output_layer)
    final_prob = (1 - eps_var) * nn_prob + eps_var * uniform_prob
    self._f_prob = ext.compile_function(
        [prob_network.input_layer.input_var],
        final_prob,
    )
    self._eps_var = eps_var

    self._feature_layer_index = feature_layer_index
    # layer before fc-softmax
    feature_layer = L.get_all_layers(prob_network.output_layer)[feature_layer_index]
    self._f_feature = ext.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(feature_layer)
    )
    self._feature_shape = L.get_output_shape(feature_layer)[1:]

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LasagnePowered.__init__(self, [prob_network.output_layer])
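# Illustrative example (not part of the original source): the epsilon mixture used above,
# computed in NumPy. With mixture weight `eps` the policy acts uniformly at random, which
# keeps every action probability bounded away from zero and forces some exploration.
import numpy as np

eps = 0.1
nn_prob = np.array([0.9, 0.05, 0.03, 0.02])    # softmax output of the network
uniform_prob = np.ones(4) / 4
final_prob = (1 - eps) * nn_prob + eps * uniform_prob
print(final_prob, final_prob.sum())            # still sums to 1; each entry is at least eps / 4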
class RecurrentCategorical(Distribution):
    def __init__(self):
        self._cat = Categorical()

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return TT.sum(
            old_prob_var * (TT.log(old_prob_var + TINY) - TT.log(new_prob_var + TINY)),
            axis=2
        )

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(
            old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
            axis=2
        )

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = x_var.shape[-1]
        flat_ratios = self._cat.likelihood_ratio_sym(
            x_var.reshape((-1, a_dim)),
            dict(prob=old_prob_var.reshape((-1, a_dim))),
            dict(prob=new_prob_var.reshape((-1, a_dim)))
        )
        return flat_ratios.reshape(old_prob_var.shape[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = probs.shape[-1]
        # a_dim = TT.printing.Print("lala")(a_dim)
        flat_logli = self._cat.log_likelihood_sym(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A
        a_dim = probs.shape[-1]
        flat_logli = self._cat.log_likelihood_sym(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_keys(self):
        return ["prob"]
def __init__(self):
    self._cat = Categorical()
def __init__(
        self,
        input_shape,
        output_dim,
        predict_all=False,  # CF
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param predict_all: if True, return predictions for every time step; otherwise the
        prediction at the last time step is tiled across the time dimension.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self.output_dim = output_dim
    self._optimizer = optimizer

    if prob_network is None:
        prob_network = GRUNetwork(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_dim=hidden_sizes[0],  # this gives 32 by default
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    l_prob = prob_network.output_layer

    LasagnePowered.__init__(self, [l_prob])

    xs_var = prob_network.input_layer.input_var
    ys_var = TT.itensor3("ys")
    old_prob_var = TT.tensor3("old_prob")

    x_mean_var = theano.shared(
        # shape (1, 1) + input_shape; the two leading axes are trajectory and time
        np.zeros((1, 1) + input_shape),
        name="x_mean",
        broadcastable=(True, True) + (False,) * len(input_shape),
    )
    x_std_var = theano.shared(
        np.ones((1, 1) + input_shape),
        name="x_std",
        broadcastable=(True, True) + (False,) * len(input_shape),
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var

    prob_var_all = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

    if predict_all:
        prob_var = prob_var_all
    else:
        # take only the last time step but keep the shape
        prob_var_last = TT.reshape(
            prob_var_all[:, -1, :],
            (TT.shape(prob_var_all)[0], 1, TT.shape(prob_var_all)[2])
        )
        # pad along the time dimension to obtain the same shape as before
        padded_prob_var = TT.tile(prob_var_last, (1, TT.shape(prob_var_all)[1], 1))
        # give it the standard name
        prob_var = padded_prob_var

    old_info_vars = dict(prob=old_prob_var)
    info_vars = dict(prob=prob_var)

    dist = self._dist = Categorical(output_dim)

    mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

    loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

    predicted_flat = special.to_onehot_sym(
        TT.flatten(TT.argmax(prob_var, axis=-1)), output_dim)
    predicted = TT.reshape(predicted_flat, TT.shape(prob_var))

    self._f_predict = ext.compile_function([xs_var], predicted)
    self._f_prob = ext.compile_function([xs_var], prob_var)
    self._prob_network = prob_network
    self._l_prob = l_prob

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[prob_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
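# Illustrative example (not part of the original source): what the predict_all=False branch
# above does, sketched in NumPy. The prediction at the last time step is kept and tiled
# across the time axis so the output keeps the per-step shape (N, T, A).
import numpy as np

prob_all = np.random.dirichlet(np.ones(3), size=(2, 4))   # shape (2, 4, 3): N=2, T=4, A=3
prob_last = prob_all[:, -1, :].reshape(2, 1, 3)           # keep only the last time step
padded = np.tile(prob_last, (1, 4, 1))                    # back to shape (2, 4, 3)
print(padded.shape)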
def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    """
    Serializable.quick_init(self, locals())

    if optimizer is None:
        if use_trust_region:
            optimizer = PenaltyLbfgsOptimizer()
        else:
            optimizer = LbfgsOptimizer()

    self.output_dim = output_dim
    self._optimizer = optimizer

    if prob_network is None:
        prob_network = MLP(
            input_shape=input_shape,
            output_dim=output_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=NL.softmax,
        )

    l_prob = prob_network.output_layer

    LasagnePowered.__init__(self, [l_prob])

    xs_var = prob_network.input_layer.input_var
    ys_var = TT.imatrix("ys")
    old_prob_var = TT.matrix("old_prob")

    x_mean_var = theano.shared(
        np.zeros((1,) + input_shape),
        name="x_mean",
        broadcastable=(True,) + (False,) * len(input_shape)
    )
    x_std_var = theano.shared(
        np.ones((1,) + input_shape),
        name="x_std",
        broadcastable=(True,) + (False,) * len(input_shape)
    )

    normalized_xs_var = (xs_var - x_mean_var) / x_std_var

    prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

    old_info_vars = dict(prob=old_prob_var)
    info_vars = dict(prob=prob_var)

    dist = self._dist = Categorical(output_dim)

    mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

    loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

    predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1), output_dim)

    self._f_predict = ext.compile_function([xs_var], predicted)
    self._f_prob = ext.compile_function([xs_var], prob_var)
    self._prob_network = prob_network
    self._l_prob = l_prob

    optimizer_args = dict(
        loss=loss,
        target=self,
        network_outputs=[prob_var],
    )

    if use_trust_region:
        optimizer_args["leq_constraint"] = (mean_kl, step_size)
        optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
    else:
        optimizer_args["inputs"] = [xs_var, ys_var]

    self._optimizer.update_opt(**optimizer_args)

    self._use_trust_region = use_trust_region
    self._name = name

    self._normalize_inputs = normalize_inputs
    self._x_mean_var = x_mean_var
    self._x_std_var = x_std_var
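# Illustrative example (not part of the original source): the prediction step above returns
# a one-hot vector for the arg-max class of each row, sketched here in NumPy instead of Theano.
import numpy as np

prob = np.array([[0.1, 0.7, 0.2],
                 [0.6, 0.3, 0.1]])
predicted = np.eye(prob.shape[1])[np.argmax(prob, axis=1)]   # one-hot of the arg-max class
print(predicted)   # [[0. 1. 0.], [1. 0. 0.]]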
def __init__(self, dim):
    self._cat = Categorical(dim)
    self._dim = dim