def __init__( self, env_spec, hardcoded_q=None, scope='policy', ent_wt=1.0, ): """ :param env_spec: A spec for the env. :param hidden_dim: dimension of hidden layer :param hidden_nonlinearity: nonlinearity used for each hidden layer :return: """ #self.graph = tf.get_default_graph() assert isinstance(env_spec.action_space, Discrete) Serializable.quick_init(self, locals()) super(CategoricalSoftQPolicy, self).__init__(env_spec) obs_dim = env_spec.observation_space.flat_dim action_dim = env_spec.action_space.flat_dim self.dist = Categorical(action_dim) self.ent_wt = ent_wt self.hardcoded_q = hardcoded_q with tf.variable_scope(scope) as vs: self.vs = vs self.q_func = tf.get_variable( 'q_func', shape=(obs_dim, action_dim)) self.q_func_plc = tf.placeholder( tf.float32, shape=(obs_dim, action_dim)) self.q_func_assgn = tf.assign(self.q_func, self.q_func_plc)
def __init__(self, *, name, policy_model, num_envs, env_spec,
             wrapped_env_action_space, action_space, observation_space,
             batching_config, init_location=None, encoder=None):
    """
    Stochastic policy wrapping an externally-built actor/critic model.

    :param name: tf variable scope under which the model is built.
    :param policy_model: model spec forwarded to ``policies.Policy``.
    :param num_envs: number of parallel envs; stored on the instance.
    :param env_spec: spec forwarded to ``StochasticPolicy.__init__``.
    :param wrapped_env_action_space: must be a Box; its first dimension
        sizes the categorical output distribution.
    :param action_space: action space forwarded to the model.
    :param observation_space: observation space forwarded to the model.
    :param batching_config: provides ``nenvs``, ``nbatch_train``, ``nsteps``.
    :param init_location: optional path to a joblib snapshot to restore.
    :param encoder: unused in this constructor — NOTE(review): presumably
        consumed elsewhere; confirm before removing.
    """
    Serializable.quick_init(self, locals())
    assert isinstance(wrapped_env_action_space, Box)
    self._dist = Categorical(wrapped_env_action_space.shape[0])
    # this is going to be serialized, so we can't add in the envs or
    # wrappers
    self.init_args = dict(name=name,
                          policy_model=policy_model,
                          init_location=init_location)
    # Hard-coded loss coefficients — NOTE(review): confirm these should not
    # be configurable via the constructor.
    ent_coef = 0.01
    vf_coef = 0.5
    max_grad_norm = 0.5
    model_args = dict(policy=policy_model,
                      ob_space=observation_space,
                      ac_space=action_space,
                      nbatch_act=batching_config.nenvs,
                      nbatch_train=batching_config.nbatch_train,
                      nsteps=batching_config.nsteps,
                      ent_coef=ent_coef,
                      vf_coef=vf_coef,
                      max_grad_norm=max_grad_norm)
    self.num_envs = num_envs

    with tf.variable_scope(name) as scope:
        policy = policies.Policy(model_args)
        self.model = policy.model
        self.act_model = self.model.act_model
        self.scope = scope

    StochasticPolicy.__init__(self, env_spec)
    self.name = name

    self.probs = tf.nn.softmax(self.act_model.pd.logits)
    obs_var = self.act_model.X

    self.tensor_values = lambda **kwargs: tf.get_default_session().run(
        self.get_params())

    self._f_dist = tensor_utils.compile_function(inputs=[obs_var],
                                                 outputs=self.probs)

    if init_location:
        # BUG FIX: the original did joblib.load(open(init_location, 'rb')),
        # leaking the file handle; close it deterministically instead.
        with open(init_location, 'rb') as snapshot_file:
            data = joblib.load(snapshot_file)
        self.restore_from_snapshot(data['policy_params'])
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """
    Categorical MLP policy over a discrete action space.

    :param name: variable-scope name for the policy's network parameters.
    :param env_spec: A spec for the mdp; action space must be Discrete.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    with tf.variable_scope(name):
        if prob_network is None:
            prob_network = self.create_MLP(
                input_shape=(obs_dim, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                name="prob_network",
            )
        self._l_obs, self._l_prob = self.forward_MLP(
            'prob_network', prob_network,
            n_hidden=len(hidden_sizes),
            input_shape=(obs_dim, ),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax,
            reuse=None)

        # Hook for feeding a custom input tensor through the same network.
        # BUG FIX: the original lambda referenced an undefined name
        # `output_nonlinearity` (not a parameter of this constructor), which
        # would raise NameError the first time it was called. The policy's
        # output nonlinearity is softmax, matching the build above.
        self._forward_out = lambda x, is_train: self.forward_MLP(
            'prob_network', prob_network,
            n_hidden=len(hidden_sizes),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax,
            input_tensor=x,
            is_training=is_train)[1]

        self._f_prob = tensor_utils.compile_function(
            [self._l_obs],
            L.get_output(self._l_prob))

    self._dist = Categorical(env_spec.action_space.n)
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
        grad_step_size=1.0,
):
    """
    MAML-style categorical MLP policy whose parameters can be swapped after
    inner-loop gradient steps.

    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :param grad_step_size: the step size taken in the learner's gradient
        update, sample uniformly if it is a range e.g. [0.1,1]
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)
    obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.n
    self.n_hidden = len(hidden_sizes)
    self.hidden_nonlinearity = hidden_nonlinearity
    self.input_shape = (None, obs_dim,)
    self.step_size = grad_step_size

    # NOTE(review): if a prob_network is supplied, self.all_params is never
    # assigned and the forward_MLP call below would fail — confirm whether
    # the prob_network argument is actually supported here.
    if prob_network is None:
        self.all_params = self.create_MLP(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            name="prob_network",
        )
    self.all_param_vals = None
    # Build the initial (pre-adaptation) forward pass; reuse=None creates
    # the variables.
    self._l_obs, self._l_prob = self.forward_MLP(
        'prob_network', self.all_params,
        n_hidden=len(hidden_sizes),
        input_shape=(obs_dim,),
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=tf.nn.softmax,
        reuse=None)

    # if you want to input your own tensor (and parameter set, e.g. the
    # post-adaptation parameters).
    self._forward_out = lambda x, params, is_train: self.forward_MLP(
        'prob_network', params,
        n_hidden=len(hidden_sizes),
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=tf.nn.softmax,
        input_tensor=x,
        is_training=is_train)[1]

    # Compiled sampler for the initial parameters; _cur_f_prob is swapped
    # out elsewhere when adapted parameters are in effect.
    self._init_f_prob = tensor_utils.compile_function(
        [self._l_obs],
        [self._l_prob])
    self._cur_f_prob = self._init_f_prob

    self._dist = Categorical(self.action_dim)
    self._cached_params = {}
    super(MAMLCategoricalMLPPolicy, self).__init__(env_spec)
def init_policy(self):
    """Compile the policy's probability and max-Q functions.

    Builds temperature-scaled logits from the deterministic forward pass,
    then compiles a softmax-probability function and a logsumexp (soft max
    Q-value) function over them, and installs the categorical distribution.
    """
    obs_input = self._obs_layer.input_var
    # Deterministic network output, scaled by the temperature self._c.
    scaled_logits = L.get_output(self._output_vec_layer,
                                 deterministic=True) / self._c

    self._f_prob = tensor_utils.compile_function(
        [obs_input], tf.nn.softmax(scaled_logits))
    self._f_max_qvals = tensor_utils.compile_function(
        [obs_input], tf.reduce_logsumexp(scaled_logits, [1]))

    self._dist = Categorical(self._n)
def __init__(
        self,
        name,
        env_spec,
        conv_filters,
        conv_filter_sizes,
        conv_strides,
        conv_pads,
        hidden_sizes=(),
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.softmax,
        prob_network=None,
):
    """
    Categorical policy backed by a convolutional network.

    :param name: name of this policy (kept for the Serializable snapshot;
        note the network itself is built under the "prob_network" scope).
    :param env_spec: A spec for the mdp; action space must be Discrete.
    :param conv_filters: number of filters per conv layer.
    :param conv_filter_sizes: kernel size per conv layer.
    :param conv_strides: stride per conv layer.
    :param conv_pads: padding mode per conv layer.
    :param hidden_sizes: list of sizes for the fully connected hidden layers.
        BUG FIX: the default was a mutable list (`[]`); an empty tuple avoids
        the shared-mutable-default pitfall and is backward compatible.
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity applied to the output layer.
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)

    self._env_spec = env_spec
    if prob_network is None:
        prob_network = ConvNetwork(
            input_shape=env_spec.observation_space.shape,
            output_dim=env_spec.action_space.n,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="prob_network",
        )

    self._l_prob = prob_network.output_layer
    self._l_obs = prob_network.input_layer
    self._f_prob = tensor_utils.compile_function(
        [prob_network.input_layer.input_var],
        L.get_output(prob_network.output_layer))

    self._dist = Categorical(env_spec.action_space.n)

    super(CategoricalConvPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def __init__(self, name, env_spec, hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh, gating_network=None,
             input_layer=None, num_options=4, conv_filters=None,
             conv_filter_sizes=None, conv_strides=None, conv_pads=None,
             input_shape=None):
    """
    Categorical policy whose network is assembled by ``self.make_network``
    (optionally with a gating network and convolutional front-end).

    :param env_spec: A spec for the mdp; action space must be Discrete.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param gating_network: optional pre-built gating network forwarded to
        make_network.
    :param input_layer: optional existing input layer to build on.
    :param num_options: number of options/sub-networks; stored on self and
        presumably consumed by make_network or callers.
    :return:
    """
    Serializable.quick_init(self, locals())
    self.num_options = num_options
    assert isinstance(env_spec.action_space, Discrete)

    n_actions = env_spec.action_space.n
    with tf.variable_scope(name):
        l_in, l_out = self.make_network(
            (env_spec.observation_space.flat_dim, ),
            n_actions,
            hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            gating_network=gating_network,
            l_in=input_layer,
            conv_filters=conv_filters,
            conv_filter_sizes=conv_filter_sizes,
            conv_strides=conv_strides,
            conv_pads=conv_pads,
            input_shape=input_shape)

        self._l_prob = l_out
        self._l_obs = l_in
        self._f_prob = tensor_utils.compile_function(
            [l_in.input_var], L.get_output(l_out))
        self._dist = Categorical(n_actions)

    super(CategoricalDecomposedPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [l_out])
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """
    MLP policy producing a softmax distribution over a discrete action space.

    :param env_spec: A spec for the mdp; action space must be Discrete.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other
        network params are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)

    n_actions = env_spec.action_space.n
    with tf.variable_scope(name):
        if prob_network is None:
            # Default network: fully-connected MLP with a softmax head.
            prob_network = MLP(
                name="prob_network",
                input_shape=(env_spec.observation_space.flat_dim,),
                output_dim=n_actions,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
            )

        self._l_obs = prob_network.input_layer
        self._l_prob = prob_network.output_layer
        # Compiled function mapping flat observations to action probabilities.
        self._f_prob = tensor_utils.compile_function(
            [self._l_obs.input_var],
            L.get_output(self._l_prob),
        )
        self._dist = Categorical(n_actions)

    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def __init__(self, dim):
    """Store the dimensionality and the underlying flat Categorical.

    :param dim: number of categories.
    """
    self._dim = dim
    self._cat = Categorical(dim)
class RecurrentCategorical(Distribution):
    """Categorical distribution over sequence data laid out as (N, T, A).

    Per-step computations are delegated to a flat ``Categorical`` after
    collapsing the batch (N) and time (T) axes into one.
    """

    def __init__(self, dim):
        self._cat = Categorical(dim)
        self._dim = dim

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return tf.reduce_sum(
            old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
            reduction_indices=2
        )

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(
            old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
            axis=2
        )

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(x_var)[2]
        # FIX: tf.pack was renamed to tf.stack in TF 1.0 and later removed;
        # the sibling copy of this class already uses tf.stack.
        flat_ratios = self._cat.likelihood_ratio_sym(
            tf.reshape(x_var, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))),
            dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim])))
        )
        return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def entropy_sym(self, dist_info_vars):
        probs = dist_info_vars["prob"]
        return -tf.reduce_sum(probs * tf.log(probs + TINY), 2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(probs)[2]
        flat_logli = self._cat.log_likelihood_sym(
            tf.reshape(xs, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(probs, tf.stack((-1, a_dim))))
        )
        return tf.reshape(flat_logli, tf.shape(probs)[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A.
        # BUG FIX: the original computed a_dim via tf.shape(probs)[2]
        # (a symbolic tensor) and called the *_sym variant on numpy arrays,
        # so numpy's reshape received a Tensor dimension. This eager path
        # must stay entirely in numpy.
        a_dim = probs.shape[2]
        flat_logli = self._cat.log_likelihood(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim)))
        )
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_specs(self):
        return [("prob", (self.dim,))]
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
):
    """
    Categorical regressor: fits a softmax MLP by maximum likelihood, with an
    optional trust-region (KL-constrained) optimization path.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param prob_network: optional pre-built network; when None an MLP with a
        softmax output is constructed.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param tr_optimizer: optimizer used for the trust-region path (defaults
        to conjugate gradient).
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param normalize_inputs: whether callers should whiten inputs using the
        x_mean/x_std variables created below.
    :param no_initial_trust_region: when True, first_optimized starts False —
        NOTE(review): presumably the first fit skips the trust region; confirm
        against the fit method.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        self.input_dim = input_shape[0]
        self.observation_space = Discrete(self.input_dim)
        self.action_space = Discrete(output_dim)

        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network"
            )

        l_prob = prob_network.output_layer

        LayersPowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        # Target one-hot labels and the old probabilities used for the KL
        # constraint in the trust-region path.
        ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
        old_prob_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="old_prob")

        # Input whitening statistics, stored as variables so they can be
        # updated from data and serialized with the model.
        x_mean_var = tf.get_variable(
            name="x_mean",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(0., dtype=tf.float32)
        )
        x_std_var = tf.get_variable(
            name="x_std",
            shape=(1,) + input_shape,
            initializer=tf.constant_initializer(1., dtype=tf.float32)
        )
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood objective.
        loss = - tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard (argmax) prediction as a one-hot vector.
        predicted = tensor_utils.to_onehot_sym(tf.argmax(prob_var, axis=1), output_dim)

        self.prob_network = prob_network
        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self.l_prob = l_prob

        # Plain maximum-likelihood path (no KL constraint)...
        self.optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var], inputs=[xs_var, ys_var])
        # ...and the trust-region path, constrained by mean KL <= step_size.
        self.tr_optimizer.update_opt(loss=loss, target=self, network_outputs=[prob_var],
                                     inputs=[xs_var, ys_var, old_prob_var],
                                     leq_constraint=(mean_kl, step_size)
                                     )

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        # NOTE(review): x_mean_var/x_std_var were already assigned above;
        # these re-assignments are redundant but harmless.
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
class RecurrentCategorical(Distribution):
    """Categorical distribution over (N, T, A)-shaped sequence data.

    Delegates per-step computations to a flat ``Categorical`` after
    collapsing the batch (N) and time (T) axes.
    """

    def __init__(self, dim):
        self._cat = Categorical(dim)
        self._dim = dim

    @property
    def dim(self):
        return self._dim

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of two categorical distributions
        """
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        return tf.reduce_sum(
            old_prob_var *
            (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
            reduction_indices=2)

    def kl(self, old_dist_info, new_dist_info):
        """
        Compute the KL divergence of two categorical distributions
        """
        old_prob = old_dist_info["prob"]
        new_prob = new_dist_info["prob"]
        return np.sum(old_prob *
                      (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
                      axis=2)

    def likelihood_ratio_sym(self, x_var, old_dist_info_vars,
                             new_dist_info_vars):
        old_prob_var = old_dist_info_vars["prob"]
        new_prob_var = new_dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(x_var)[2]
        flat_ratios = self._cat.likelihood_ratio_sym(
            tf.reshape(x_var, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))),
            dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim]))))
        return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2])

    def entropy(self, dist_info):
        probs = dist_info["prob"]
        return -np.sum(probs * np.log(probs + TINY), axis=2)

    def entropy_sym(self, dist_info_vars):
        probs = dist_info_vars["prob"]
        return -tf.reduce_sum(probs * tf.log(probs + TINY), 2)

    def log_likelihood_sym(self, xs, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * T * A
        a_dim = tf.shape(probs)[2]
        flat_logli = self._cat.log_likelihood_sym(
            tf.reshape(xs, tf.stack([-1, a_dim])),
            dict(prob=tf.reshape(probs, tf.stack((-1, a_dim)))))
        return tf.reshape(flat_logli, tf.shape(probs)[:2])

    def log_likelihood(self, xs, dist_info):
        probs = dist_info["prob"]
        # Assume layout is N * T * A.
        # BUG FIX: the original computed a_dim with tf.shape(probs)[2]
        # (symbolic) and called the *_sym variant on numpy inputs, so numpy's
        # reshape received a Tensor dimension. This eager path must use numpy
        # shapes and the numpy log_likelihood of the wrapped Categorical.
        a_dim = probs.shape[2]
        flat_logli = self._cat.log_likelihood(
            xs.reshape((-1, a_dim)),
            dict(prob=probs.reshape((-1, a_dim))))
        return flat_logli.reshape(probs.shape[:2])

    @property
    def dist_info_specs(self):
        return [("prob", (self.dim, ))]