def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs):
    """
    Build the (lazily compiled) loss and loss+flat-gradient functions used by
    the optimizer.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
     :class:`rllab_maml.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: Optional list of extra symbolic input variables,
     appended after `inputs` when compiling the functions.
    :return: No return value.
    """
    self._target = target

    def get_opt_output():
        # Flattened gradient of the loss w.r.t. all trainable params.
        # Both outputs are cast to float64 because scipy-based optimizers
        # (e.g. L-BFGS) operate in double precision.
        flat_grad = tensor_utils.flatten_tensor_variables(
            tf.gradients(loss, target.get_params(trainable=True)))
        return [tf.cast(loss, tf.float64), tf.cast(flat_grad, tf.float64)]

    if extra_inputs is None:
        extra_inputs = list()
    # lazydict defers compile_function until the entry is first accessed.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(
            inputs + extra_inputs, loss),
        f_opt=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=get_opt_output(),
        ))
def update_opt(self, loss, target, inputs, inner_kl, outer_kl, extra_inputs=None,
               meta_batch_size=1, num_grad_updates=1, **kwargs):
    """
    Set up the meta-optimization problem, exposing inner/outer KL diagnostics.

    :param loss: symbolic expression for the (outer) meta-objective
    :param target: parameterized object to optimize over
    :param inputs: list of symbolic input variables
    :param inner_kl: Symbolic expression for inner kl
    :param outer_kl: Symbolic expression for outer kl
    :param extra_inputs: optional extra symbolic inputs appended to `inputs`
    :param meta_batch_size: number of MAML tasks, for batcher
    :param num_grad_updates: number of inner gradient steps per meta-iteration
    """
    super().update_opt(loss, target, inputs, extra_inputs, **kwargs)
    if extra_inputs is None:
        extra_inputs = list()
    # NOTE: this replaces the `_opt_fun` lazydict created by the superclass
    # with one that also exposes the inner/outer KL diagnostics.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss),
        f_inner_kl=lambda: tensor_utils.compile_function(inputs + extra_inputs, inner_kl),
        f_outer_kl=lambda: tensor_utils.compile_function(inputs + extra_inputs, outer_kl),
    )
    if self.multi_adam > 1:
        # One train op per optimizer in self._tf_optimizers (multi-Adam mode);
        # the superclass already built the single-optimizer train op.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if update_ops:  # for batch norm
            # Make the train ops depend on the batch-norm moving-average updates.
            updates = tf.group(*update_ops)
            with tf.control_dependencies([updates]):
                self._train_ops = [optimizer.minimize(loss, var_list=target.get_params(trainable=True))
                                   for optimizer in self._tf_optimizers]
        else:
            self._train_ops = [optimizer.minimize(loss, var_list=target.get_params(trainable=True))
                               for optimizer in self._tf_optimizers]

    self.meta_batch_size = meta_batch_size
    self.num_grad_updates = num_grad_updates
def update_opt(self, f, target, inputs, reg_coeff):
    """
    Build a symbolic Hessian-vector-product function for `f` (typically a KL
    constraint), used by conjugate-gradient optimization.

    :param f: symbolic expression whose Hessian (w.r.t. target params) is needed
    :param target: parameterized object providing the trainable params
    :param inputs: list of symbolic input variables
    :param reg_coeff: regularization coefficient (stored for later use)
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = tf.gradients(f, xs=params)
    # tf.gradients returns None for params that f does not depend on;
    # replace those with explicit zeros so downstream ops stay well-defined.
    for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
        if grad is None:
            constraint_grads[idx] = tf.zeros_like(param)

    # One placeholder-like tensor per param: the vector to multiply with.
    xs = tuple([
        tensor_utils.new_tensor_like(p.name.split(":")[0], p) for p in params
    ])

    def Hx_plain():
        # H·x via the gradient-of-gradient trick: d/dθ [ (∇f)·x ].
        Hx_plain_splits = tf.gradients(
            tf.reduce_sum(
                tf.stack([
                    tf.reduce_sum(g * x)
                    for g, x in zip(constraint_grads, xs)
                ])), params)
        for idx, (Hx, param) in enumerate(zip(Hx_plain_splits, params)):
            if Hx is None:
                Hx_plain_splits[idx] = tf.zeros_like(param)
        return tensor_utils.flatten_tensor_variables(Hx_plain_splits)

    self.opt_fun = ext.lazydict(
        f_Hx_plain=lambda: tensor_utils.compile_function(
            inputs=inputs + xs,
            outputs=Hx_plain(),
            log_name="f_Hx_plain",
        ),
    )
def __init__(self, dim):
    """Sampler over `dim` discrete outcomes, drawing from per-row weight logits."""
    self._dim = dim
    # One row of (unnormalized) log-weights per sample to draw.
    logits_ph = tf.placeholder(dtype=tf.float32, shape=(None, dim), name="weights")
    # Draw a single index per row; drop the trailing sample dimension.
    draws = tf.multinomial(logits_ph, num_samples=1)
    self._f_sample = tensor_utils.compile_function(
        inputs=[logits_ph],
        outputs=draws[:, 0],
    )
def update_opt(self, loss, target, leq_constraint, inputs, constraint_name="constraint", *args, **kwargs):
    """
    Set up a penalized objective `loss + penalty * constraint_term` and the
    lazily compiled functions used by the penalty-search optimizer.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
     :class:`rllab_maml.core.paramerized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs
    :param constraint_name: label used for the constraint (diagnostics)
    :return: No return value.
    """
    constraint_term, constraint_value = leq_constraint
    with tf.variable_scope(self._name):
        # Scalar penalty coefficient, fed at call time by the penalty search.
        penalty_var = tf.placeholder(tf.float32, tuple(), name="penalty")
    penalized_loss = loss + penalty_var * constraint_term

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    def get_opt_output():
        params = target.get_params(trainable=True)
        grads = tf.gradients(penalized_loss, params)
        # Params the penalized loss does not depend on get zero gradients.
        for idx, (grad, param) in enumerate(zip(grads, params)):
            if grad is None:
                grads[idx] = tf.zeros_like(param)
        flat_grad = tensor_utils.flatten_tensor_variables(grads)
        # float64 casts: scipy's L-BFGS works in double precision.
        return [
            tf.cast(penalized_loss, tf.float64),
            tf.cast(flat_grad, tf.float64),
        ]

    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(inputs, loss, log_name="f_loss"),
        f_constraint=lambda: tensor_utils.compile_function(inputs, constraint_term, log_name="f_constraint"),
        f_penalized_loss=lambda: tensor_utils.compile_function(
            inputs=inputs + [penalty_var],
            outputs=[penalized_loss, loss, constraint_term],
            log_name="f_penalized_loss",
        ),
        f_opt=lambda: tensor_utils.compile_function(
            inputs=inputs + [penalty_var],
            outputs=get_opt_output(),
        )
    )
def update_opt(self, loss, target, inputs, kl, extra_inputs=None, **kwargs):
    """
    Hook up the PPO surrogate loss and its KL diagnostic.

    :param loss: symbolic expression for the surrogate loss
    :param target: parameterized object to optimize over
    :param inputs: list of symbolic input variables
    :param kl: symbolic expression for the KL divergence diagnostic
    :param extra_inputs: optional extra symbolic inputs appended to `inputs`
    """
    super(PPOOptimizer, self).update_opt(loss, target, inputs, extra_inputs, **kwargs)

    extra_inputs = [] if extra_inputs is None else extra_inputs
    all_inputs = inputs + extra_inputs

    # Replaces the superclass _opt_fun with one that also exposes the KL.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(all_inputs, loss),
        f_kl=lambda: tensor_utils.compile_function(all_inputs, kl),
    )
def update_opt(self, loss, target, inputs, inner_kl, extra_inputs=None,
               meta_batch_size=1, num_grad_updates=1, **kwargs):
    """
    Set up the meta-optimization problem with an inner-KL diagnostic.

    :param inner_kl: Symbolic expression for inner kl
    :param meta_batch_size: number of MAML tasks, for batcher
    :param num_grad_updates: number of inner gradient steps per meta-iteration
    """
    super().update_opt(loss, target, inputs, extra_inputs, **kwargs)

    extra_inputs = [] if extra_inputs is None else extra_inputs
    all_inputs = inputs + extra_inputs

    # Replaces the superclass _opt_fun with one that also exposes the inner KL.
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(all_inputs, loss),
        f_inner_kl=lambda: tensor_utils.compile_function(all_inputs, inner_kl),
    )

    self.meta_batch_size = meta_batch_size
    self.num_grad_updates = num_grad_updates
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """
    Categorical MLP policy: a softmax head over discrete actions.

    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params
     are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)
    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim
    with tf.variable_scope(name):
        if prob_network is None:
            prob_network = self.create_MLP(
                input_shape=(obs_dim,),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                name="prob_network",
            )
        self._l_obs, self._l_prob = self.forward_MLP(
            'prob_network', prob_network,
            n_hidden=len(hidden_sizes),
            input_shape=(obs_dim,),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax,
            reuse=None)

        # if you want to input your own tensor.
        # BUG FIX: the original lambda referenced `output_nonlinearity`, a name
        # that does not exist in this constructor's scope (it is not a
        # parameter), so invoking the lambda raised NameError. The forward pass
        # above uses tf.nn.softmax, so use the same output nonlinearity here.
        self._forward_out = lambda x, is_train: self.forward_MLP(
            'prob_network', prob_network,
            n_hidden=len(hidden_sizes),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=tf.nn.softmax,
            input_tensor=x,
            is_training=is_train)[1]

        # Compiled feed-forward pass: observations -> action probabilities.
        self._f_prob = tensor_utils.compile_function(
            [self._l_obs],
            L.get_output(self._l_prob)
        )

        self._dist = Categorical(env_spec.action_space.n)
def update_opt(self, f, target, inputs, reg_coeff):
    """
    Build a finite-difference Hessian-vector-product function for `f`.

    The HVP is approximated by evaluating the flat gradient of `f` at
    parameter values perturbed along the given vector.

    :param f: symbolic expression whose Hessian-vector products are needed
    :param target: parameterized object providing get/set of param values
    :param inputs: list of symbolic input variables
    :param reg_coeff: regularization coefficient (stored for later use)
    """
    self.target = target
    self.reg_coeff = reg_coeff
    params = target.get_params(trainable=True)

    constraint_grads = tf.gradients(f, xs=params)
    # Params that f does not depend on get explicit zero gradients.
    for idx, (grad, param) in enumerate(zip(constraint_grads, params)):
        if grad is None:
            constraint_grads[idx] = tf.zeros_like(param)
    flat_grad = tensor_utils.flatten_tensor_variables(constraint_grads)

    def f_Hx_plain(
            *args):  #receives inputs and xs(flattened inputs) as arguments
        inputs_ = args[:len(inputs)]
        xs = args[len(inputs):]
        flat_xs = np.concatenate([np.reshape(x, (-1, )) for x in xs])
        param_val = self.target.get_param_values(trainable=True)
        # Step size scaled inversely to the parameter norm for stability.
        eps = np.cast['float32'](self.base_eps /
                                 (np.linalg.norm(param_val) + 1e-8))
        # Evaluate gradient at params + eps * x, then restore params.
        self.target.set_param_values(param_val + eps * flat_xs, trainable=True)
        flat_grad_dvplus = self.opt_fun["f_grad"](*inputs_)
        self.target.set_param_values(param_val, trainable=True)
        if self.symmetric:
            # Central difference: (g(θ+εx) - g(θ-εx)) / (2ε).
            self.target.set_param_values(param_val - eps * flat_xs,
                                         trainable=True)
            flat_grad_dvminus = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad_dvminus) / (2 * eps)
            self.target.set_param_values(param_val, trainable=True)
        else:
            # Forward difference: (g(θ+εx) - g(θ)) / ε.
            # NOTE(review): this local `flat_grad` shadows the enclosing
            # symbolic `flat_grad`; harmless here since the closure variable
            # is only used by the f_grad lambda below.
            flat_grad = self.opt_fun["f_grad"](*inputs_)
            hx = (flat_grad_dvplus - flat_grad) / eps
        return hx

    self.opt_fun = ext.lazydict(
        f_grad=lambda: tensor_utils.compile_function(
            inputs=inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        # f_Hx_plain is a plain Python callable, not a compiled TF function.
        f_Hx_plain=lambda: f_Hx_plain,
    )
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
):
    """
    Categorical policy whose action probabilities come from an MLP softmax head.

    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params
     are ignored
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)

    with tf.variable_scope(name):
        if prob_network is None:
            # Default network: MLP over flattened observations with a softmax
            # head producing one probability per discrete action.
            prob_network = MLP(
                input_shape=(env_spec.observation_space.flat_dim, ),
                output_dim=env_spec.action_space.n,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=tf.nn.softmax,
                name="prob_network",
            )

        out_layer = prob_network.output_layer
        in_layer = prob_network.input_layer
        self._l_prob = out_layer
        self._l_obs = in_layer
        # Compiled feed-forward pass: observation batch -> action probabilities.
        self._f_prob = tensor_utils.compile_function(
            [in_layer.input_var], L.get_output(out_layer))

    self._dist = Categorical(env_spec.action_space.n)
    super(CategoricalMLPPolicy, self).__init__(env_spec)
    LayersPowered.__init__(self, [prob_network.output_layer])
def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
    # Initializes the update opt used in the optimization
    """
    Build the first-order train op(s) and the lazily compiled loss function.

    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
     :class:`rllab_maml.core.paramerized.Parameterized` class.
    :param inputs: A list of symbolic variables as inputs
    :param extra_inputs: optional extra symbolic inputs appended to `inputs`
    :return: No return value.
    """
    self._target = target

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:  # for batch norm
        # Ensure batch-norm moving-average updates run before each train step.
        updates = tf.group(*update_ops)
        with tf.control_dependencies([updates]):
            self._train_op = self._tf_optimizer.minimize(
                loss, var_list=target.get_params(trainable=True))
            # Optional second optimizer used for an initial training phase.
            if self._init_tf_optimizer is not None:
                self._init_train_op = self._init_tf_optimizer.minimize(
                    loss, var_list=target.get_params(trainable=True))
    else:
        self._train_op = self._tf_optimizer.minimize(
            loss, var_list=target.get_params(trainable=True))
        if self._init_tf_optimizer is not None:
            self._init_train_op = self._init_tf_optimizer.minimize(
                loss, var_list=target.get_params(trainable=True))

    if extra_inputs is None:
        extra_inputs = list()
    self._input_vars = inputs + extra_inputs
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(
            inputs + extra_inputs, loss),
    )

    # Kept around for debugging/inspection only; not used by the update itself.
    self.debug_loss = loss
    self.debug_vars = target.get_params(trainable=True)
    self.debug_target = target
def __init__(self,
             name,
             input_shape,
             output_dim,
             mean_network=None,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=tf.nn.tanh,
             optimizer=None,
             use_trust_region=True,
             step_size=0.01,
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             std_nonlinearity=None,
             normalize_inputs=True,
             normalize_outputs=True,
             subsample_factor=1.0):
    """
    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
    adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
    :param adaptive_std: Whether to make the std a function of the states.
    :param std_share_network: Whether to use the same network as the mean.
    :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
    `std_share_network` is False. It defaults to the same architecture as the mean.
    :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
    is False. It defaults to the same non-linearity as the mean.
    :param normalize_inputs: Whether to normalize inputs by a (stored) mean/std.
    :param normalize_outputs: Whether to normalize outputs by a (stored) mean/std.
    :param subsample_factor: Fraction of the dataset used per fit (stored here).
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            # Trust region -> penalty-based L-BFGS; otherwise plain L-BFGS.
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer
        self._subsample_factor = subsample_factor

        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            # State-dependent log-std: a separate MLP sharing the input var.
            l_log_std = MLP(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            # Constant (optionally learnable) per-dimension log-std parameter.
            l_log_std = L.ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=tf.constant_initializer(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LayersPowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32,
                                name="ys",
                                shape=(None, output_dim))
        # NOTE(review): this placeholder is also named "ys" (likely intended
        # to be "old_means"); TF will uniquify the node name, so behavior is
        # unaffected — confirm before renaming.
        old_means_var = tf.placeholder(dtype=tf.float32,
                                       name="ys",
                                       shape=(None, output_dim))
        old_log_stds_var = tf.placeholder(dtype=tf.float32,
                                          name="old_log_stds",
                                          shape=(None, output_dim))

        # Running input/output normalization statistics (updated elsewhere).
        x_mean_var = tf.Variable(
            np.zeros((1, ) + input_shape, dtype=np.float32),
            name="x_mean",
        )
        x_std_var = tf.Variable(
            np.ones((1, ) + input_shape, dtype=np.float32),
            name="x_std",
        )
        y_mean_var = tf.Variable(
            np.zeros((1, output_dim), dtype=np.float32),
            name="y_mean",
        )
        y_std_var = tf.Variable(
            np.ones((1, output_dim), dtype=np.float32),
            name="y_std",
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        # Network operates in normalized space; outputs are de-normalized below.
        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        # Trust-region constraint: mean KL between old and new distributions.
        mean_kl = tf.reduce_mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        # Negative log-likelihood of targets under the predicted Gaussian.
        loss = -tf.reduce_mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = tensor_utils.compile_function([xs_var], means_var)
        self._f_pdists = tensor_utils.compile_function(
            [xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[
                normalized_means_var, normalized_log_stds_var
            ],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        bias_transform=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std: boolean indicating whether std shall be a trainable variable
    :param bias_transform: boolean indicating whether bias transformation shall be added to the MLP
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std will be stored, and applied a exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box) or isinstance(
        env_spec.action_space, BoxMAML)

    obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.flat_dim
    self.n_hidden = len(hidden_sizes)
    self.hidden_nonlinearity = hidden_nonlinearity
    self.output_nonlinearity = output_nonlinearity
    self.input_shape = (
        None,
        obs_dim,
    )
    self.name = name

    with tf.variable_scope(self.name):
        # create network
        if mean_network is None:
            self.all_params = create_MLP(  # TODO: this should not be a method of the policy! --> helper
                name="mean_network",
                input_shape=self.input_shape,
                output_dim=self.action_dim,
                hidden_sizes=hidden_sizes,
                bias_transform=bias_transform,
            )
            self.input_tensor, _ = forward_MLP(
                'mean_network',
                self.input_shape,
                self.n_hidden,
                self.hidden_nonlinearity,
                self.output_nonlinearity,
                self.all_params,
                reuse=None,  # Need to run this for batch norm
                bias_transform=bias_transform,
            )
            # Symbolic forward pass through the mean network with explicit
            # params, so fast (adapted) weights can be substituted later.
            forward_mean = lambda x, params, is_train: forward_MLP(
                'mean_network',
                self.input_shape,
                self.n_hidden,
                self.hidden_nonlinearity,
                self.output_nonlinearity,
                params,
                input_tensor=x,
                is_training=is_train,
                bias_transform=bias_transform)[1]
        else:
            raise NotImplementedError('Not supported.')

        if std_network is not None:
            raise NotImplementedError('Not supported.')
        else:
            if adaptive_std:
                raise NotImplementedError('Not supported.')
            else:
                # Constant per-dimension std parameter, stored in the chosen
                # parametrization (log-std for 'exp', pre-softplus for 'softplus').
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError
                self.all_params['std_param'] = make_param_layer(
                    num_units=self.action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )
                forward_std = lambda x, params: forward_param_layer(
                    x, params['std_param'])

        # unify forward mean and forward std into a single function
        self._forward = lambda obs, params, is_train: (forward_mean(
            obs, params, is_train), forward_std(obs, params))

        self.std_parametrization = std_parametrization

        # Minimum std, expressed in the same parametrization as std_param.
        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        self._dist = DiagonalGaussian(self.action_dim)

        self._cached_params = {}

        super(BaseMLPPolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(self.input_tensor,
                                           dict(),
                                           is_training=False)
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        # pre-update policy
        self._cur_f_dist = tensor_utils.compile_function(
            inputs=[self.input_tensor],
            outputs=[mean_var, log_std_var],
        )
def init_opt(self):
    """
    Build the symbolic policy-gradient objective and KL diagnostics, and hand
    the surrogate loss to the optimizer.
    """
    # Recurrent policies add a time dimension to every tensor.
    is_recurrent = int(self.policy.recurrent)

    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage',
        ndim=1 + is_recurrent,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    # Placeholders for the distribution info of the data-collecting policy.
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    # Placeholders for any additional policy state (e.g. recurrent state).
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    if is_recurrent:
        # Mask for valid (non-padding) timesteps in each trajectory.
        valid_var = tf.placeholder(tf.float32,
                                   shape=[None, None],
                                   name="valid")
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -tf.reduce_sum(
            logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = -tf.reduce_mean(logli * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)

    input_list = [obs_var, action_var, advantage_var
                  ] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    #self.policy.set_init_surr_obj(input_list, [surr_obj]) # debugging

    self.optimizer.update_opt(loss=surr_obj,
                              target=self.policy,
                              inputs=input_list)

    # Compiled KL diagnostics (mean and max), reported after each update.
    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl, )
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of tensorflow
    for sampling with the post-update policy.

    :param samples: per-task sample dicts with 'observations', 'actions',
     'advantages' keys (one entry per meta-task)
    """
    start = time.time()
    num_tasks = len(samples)
    param_keys = self.all_params.keys()
    update_param_keys = param_keys
    no_update_param_keys = []

    sess = tf.get_default_session()

    # Flatten the per-task samples into the ordering expected by
    # self.input_list_for_grad: all obs, then all actions, then all advantages.
    obs_list, action_list, adv_list = [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions',
                             'advantages')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])

    inputs = obs_list + action_list + adv_list

    # To do a second update, replace self.all_params below with the params that were used to collect the policy.
    init_param_values = None
    if self.all_param_vals is not None:  # skip this in first iteration
        # Remember the pre-update params so they can be restored afterwards.
        init_param_values = self.get_variable_values(self.all_params)

    step_size = self.step_size
    for i in range(num_tasks):
        if self.all_param_vals is not None:  # skip this in first iteration
            self.assign_params(self.all_params, self.all_param_vals[i])

    if 'all_fast_params_tensor' not in dir(
            self):  # only enter if first iteration
        # make computation graph once
        self.all_fast_params_tensor = []
        # compute gradients for a current task (symbolic)
        for i in range(num_tasks):
            # compute gradients for a current task (symbolic)
            gradients = dict(
                zip(
                    update_param_keys,
                    tf.gradients(self.surr_objs[i], [
                        self.all_params[key] for key in update_param_keys
                    ])))
            # gradient update for params of current task (symbolic)
            fast_params_tensor = OrderedDict(
                zip(update_param_keys, [
                    self.all_params[key] - step_size * gradients[key]
                    for key in update_param_keys
                ]))
            # undo gradient update for no_update_params (symbolic)
            for k in no_update_param_keys:
                fast_params_tensor[k] = self.all_params[k]

            # tensors that represent the updated params for all of the tasks (symbolic)
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull new param vals out of tensorflow, so gradient computation only done once
    ## first is the vars, second the values
    # these are the updated values of the params after the gradient step
    self.all_param_vals = sess.run(
        self.all_fast_params_tensor,
        feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

    # reset parameters to original ones
    if init_param_values is not None:  # skip this in first iteration
        self.assign_params(self.all_params, init_param_values)

    # compile the _cur_f_dist with updated params
    outputs = []
    # Split the batched input tensor so each task's slice goes through its
    # own adapted parameters.
    inputs = tf.split(self.input_tensor, num_tasks, 0)
    for i in range(num_tasks):
        # TODO - use a placeholder to feed in the params, so that we don't have to recompile every time.
        task_inp = inputs[i]
        info, _ = self.dist_info_sym(task_inp,
                                     dict(),
                                     all_params=self.all_param_vals[i],
                                     is_training=False)

        outputs.append([info['mean'], info['log_std']])

    self._cur_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=outputs,
    )
    total_time = time.time() - start
    logger.record_tabular("ComputeUpdatedDistTime", total_time)
def __init__(self,
             name,
             env_spec,
             hidden_sizes=(32, 32),
             learn_std=True,
             init_std=1.0,
             adaptive_std=False,
             std_share_network=False,
             std_hidden_sizes=(32, 32),
             min_std=1e-6,
             std_hidden_nonlinearity=tf.nn.tanh,
             hidden_nonlinearity=tf.nn.tanh,
             output_nonlinearity=None,
             mean_network=None,
             std_network=None,
             std_parametrization='exp'):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std will be stored, and applied a exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    with tf.variable_scope(name):
        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                name="mean_network",
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_std_param = std_network.output_layer
        else:
            if adaptive_std:
                # State-dependent std: separate MLP sharing the input layer.
                std_network = MLP(
                    name="std_network",
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_std_param = std_network.output_layer
            else:
                # Constant per-dimension std parameter, stored in the chosen
                # parametrization (log-std for 'exp', pre-softplus otherwise).
                if std_parametrization == 'exp':
                    init_std_param = np.log(init_std)
                elif std_parametrization == 'softplus':
                    init_std_param = np.log(np.exp(init_std) - 1)
                else:
                    raise NotImplementedError

                l_std_param = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(init_std_param),
                    name="output_std_param",
                    trainable=learn_std,
                )

        self.std_parametrization = std_parametrization

        # Minimum std expressed in the same parametrization as std_param.
        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

        # mean_var, log_std_var = L.get_output([l_mean, l_std_param])
        #
        # if self.min_std_param is not None:
        #     log_std_var = tf.maximum(log_std_var, np.log(min_std))
        #
        # self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_std_param = l_std_param

        self._dist = DiagonalGaussian(action_dim)

        LayersPowered.__init__(self, [l_mean, l_std_param])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        dist_info_sym = self.dist_info_sym(
            mean_network.input_layer.input_var, dict())
        mean_var = dist_info_sym["mean"]
        log_std_var = dist_info_sym["log_std"]

        # Compiled forward pass: observations -> (action mean, log std).
        self._f_dist = tensor_utils.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
        grad_step_size=1.0,
):
    """
    :param env_spec: A spec for the mdp.
    :param hidden_sizes: list of sizes for the fully connected hidden layers
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param prob_network: manually specified network for this policy, other network params
     are ignored
    :param grad_step_size: the step size taken in the learner's gradient update, sample uniformly if it is a range e.g. [0.1,1]
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Discrete)
    obs_dim = env_spec.observation_space.flat_dim
    self.action_dim = env_spec.action_space.n
    self.n_hidden = len(hidden_sizes)
    self.hidden_nonlinearity = hidden_nonlinearity
    self.input_shape = (
        None,
        obs_dim,
    )
    self.step_size = grad_step_size

    if prob_network is None:
        # Explicit param dict (rather than a Layers network) so that MAML
        # fast weights can be substituted into the forward pass.
        self.all_params = self.create_MLP(
            output_dim=self.action_dim,
            hidden_sizes=hidden_sizes,
            name="prob_network",
        )
    self._l_obs, self._l_prob = self.forward_MLP(
        'prob_network',
        self.all_params,
        n_hidden=len(hidden_sizes),
        input_shape=(obs_dim, ),
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=tf.nn.softmax,
        reuse=None)

    # if you want to input your own tensor.
    self._forward_out = lambda x, params, is_train: self.forward_MLP(
        'prob_network',
        params,
        n_hidden=len(hidden_sizes),
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=tf.nn.softmax,
        input_tensor=x,
        is_training=is_train)[1]

    # Pre-update (initial-parameter) probability function; _cur_f_prob is
    # swapped out after inner gradient updates.
    self._init_f_prob = tensor_utils.compile_function([self._l_obs],
                                                      [self._l_prob])
    self._cur_f_prob = self._init_f_prob

    self._dist = Categorical(self.action_dim)
    self._cached_params = {}
    super(MAMLCategoricalMLPPolicy, self).__init__(env_spec)
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of tensorflow
    for sampling with the post-update policy.

    :param samples: list (one entry per task) of sample dicts containing
        'observations', 'actions', 'advantages' and 'agent_infos'.
    Side effects: updates self.all_param_vals with the post-gradient-step
        parameter values and points self._cur_f_dist at the compiled
        post-update distribution function.
    """
    start = time.time()
    num_tasks = len(samples)
    param_keys = self.all_params.keys()
    update_param_keys = param_keys
    no_update_param_keys = []

    sess = tf.get_default_session()

    # Flatten per-task sample data into one big input list, ordered to match
    # self.input_list_for_grad.
    obs_list, action_list, adv_list, distr_list = [], [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions',
                             'advantages', 'agent_infos')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])
        distr_list.extend(inputs[3][k] for k in self.distribution.dist_info_keys)

    inputs = obs_list + action_list + adv_list + distr_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    if self.first_inner_step:
        # On the first inner step of an iteration, snapshot the pre-update
        # parameter values (restored at the end of this call).
        self.init_param_values = self.get_variable_values(self.all_params)
        self.all_param_vals = [
            self.get_variable_values(self.all_params) for _ in range(num_tasks)
        ]

    if self.params_ph is None:
        # One placeholder per parameter per task, used to feed current values in.
        self.params_ph = [
            OrderedDict([(key, tf.placeholder(tf.float32, shape=value.shape))
                         for key, value in self.all_params.items()])
            for _ in range(num_tasks)
        ]

    if 'all_fast_params_tensor' not in dir(self):  # only enter if first iteration
        # make computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            # compute gradients for a current task (symbolic)
            # NOTE(review): these tf.assign ops are created but never stored or
            # run, so they have no effect on the graph evaluation — looks like
            # dead code; verify against the intended update scheme.
            for key in self.all_params.keys():
                tf.assign(self.all_params[key], self.params_ph[i][key])
            gradients = dict(
                zip(update_param_keys,
                    tf.gradients(self.surr_objs[i],
                                 [self.all_params[key] for key in update_param_keys])))
            # gradient update for params of current task (symbolic)
            fast_params_tensor = OrderedDict(
                zip(update_param_keys, [
                    self.all_params[key] - tf.multiply(
                        self.param_step_sizes[key + "_step_size"], gradients[key])
                    for key in update_param_keys
                ]))
            # add step sizes to fast_params_tensor
            fast_params_tensor.update(self.param_step_sizes)
            # undo gradient update for no_update_params (symbolic)
            for k in no_update_param_keys:
                fast_params_tensor[k] = self.all_params[k]
            # tensors that represent the updated params for all of the tasks (symbolic)
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull new param vals out of tensorflow, so gradient computation only done once
    ## first is the vars, second the values
    # these are the updated values of the params after the gradient step
    feed_dict = list(zip(self.input_list_for_grad, inputs))
    feed_dict_params = list(
        (self.params_ph[task][key], self.all_param_vals[task][key])
        for task in range(num_tasks) for key in self.params_ph[0].keys())
    feed_dict = dict(feed_dict + feed_dict_params)
    self.all_param_vals = sess.run(self.all_fast_params_tensor,
                                   feed_dict=feed_dict)

    if self.all_param_ph is None:
        # Placeholders to feed the *updated* parameter values into the sampler.
        self.all_param_ph = [
            OrderedDict([(key, tf.placeholder(tf.float32, shape=value.shape))
                         for key, value in self.all_param_vals[0].items()])
            for _ in range(num_tasks)
        ]

    # reset parameters to original ones
    self.assign_params(self.all_params, self.init_param_values)

    # compile the _cur_f_dist with updated params
    if not self.compiled:
        outputs = []
        with tf.variable_scope("post_updated_policy"):
            inputs = tf.split(self.input_tensor, num_tasks, 0)
            for i in range(num_tasks):
                # TODO - use a placeholder to feed in the params, so that we
                # don't have to recompile every time.
                task_inp = inputs[i]
                info, _ = self.dist_info_sym(task_inp, dict(),
                                             all_params=self.all_param_ph[i],
                                             is_training=False)
                outputs.append([info['mean'], info['log_std']])
            self.__cur_f_dist = tensor_utils.compile_function(
                inputs=[self.input_tensor, self.param_noise_std_ph] + sum(
                    [list(param_ph.values()) for param_ph in self.all_param_ph],
                    []),
                outputs=outputs,
            )
        self.compiled = True
    self._cur_f_dist = self.__cur_f_dist

    self.first_inner_step = False
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once and pull them out of tensorflow for sampling.

    :param samples: list (one entry per task) of sample dicts with
        'observations', 'actions' and 'advantages'.
    Side effects: sets self.all_param_vals to the post-update parameter values
        and recompiles self._cur_f_prob with those values baked in.
    """
    num_tasks = len(samples)
    param_keys = self.all_params.keys()

    sess = tf.get_default_session()

    # Flatten per-task sample data, ordered to match self.input_list_for_grad.
    obs_list, action_list, adv_list = [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions', 'advantages')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])

    inputs = obs_list + action_list + adv_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    init_param_values = None
    if self.all_param_vals is not None:
        # Snapshot current variable values so they can be restored below.
        init_param_values = self.get_variable_values(self.all_params)

    step_size = self.step_size
    for i in range(num_tasks):
        if self.all_param_vals is not None:
            # NOTE(review): each iteration overwrites the same shared
            # variables; only the last task's assignment is in effect when the
            # sess.run below executes — confirm this is intended.
            self.assign_params(self.all_params, self.all_param_vals[i])

    if 'all_fast_params_tensor' not in dir(self):
        # make computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            # Symbolic one-step gradient descent on task i's surrogate objective.
            gradients = dict(
                zip(
                    param_keys,
                    tf.gradients(
                        self.surr_objs[i],
                        [self.all_params[key] for key in param_keys])))
            fast_params_tensor = dict(
                zip(param_keys, [
                    self.all_params[key] - step_size * gradients[key]
                    for key in param_keys
                ]))
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull new param vals out of tensorflow, so gradient computation only done once
    self.all_param_vals = sess.run(
        self.all_fast_params_tensor,
        feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

    if init_param_values is not None:
        # reset shared variables to their pre-update values
        self.assign_params(self.all_params, init_param_values)

    outputs = []
    # NOTE(review): tf.split(0, num_tasks, x) is the pre-TF-1.0 argument order
    # (axis, num, value); newer code elsewhere in this file uses
    # tf.split(value, num, axis) — verify which TF version this targets.
    inputs = tf.split(0, num_tasks, self._l_obs)
    for i in range(num_tasks):
        # TODO - use a placeholder to feed in the params, so that we don't have
        # to recompile every time.
        task_inp = inputs[i]
        info, _ = self.dist_info_sym(task_inp, dict(),
                                     all_params=self.all_param_vals[i],
                                     is_training=False)
        outputs.append([info['prob']])

    self._cur_f_prob = tensor_utils.compile_function(
        inputs=[self._l_obs],
        outputs=outputs,
    )
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp'
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std will be stored, and applied a exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :return:
    """
    Serializable.quick_init(self, locals())
    # Gaussian policy requires a continuous (Box) action space.
    assert isinstance(env_spec.action_space, Box)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # create network
    if mean_network is None:
        # Explicit parameter dict so fast-adapted copies can be built symbolically.
        self.mean_params = mean_params = self.create_MLP(
            name="mean_network",
            input_shape=(None, obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
        )
        input_tensor, mean_tensor = self.forward_MLP(
            'mean_network',
            mean_params,
            n_hidden=len(hidden_sizes),
            input_shape=(obs_dim,),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            reuse=None  # Needed for batch norm
        )
        # if you want to input your own thing: forward pass on an arbitrary tensor.
        self._forward_mean = lambda x, is_train: self.forward_MLP(
            'mean_network',
            mean_params,
            n_hidden=len(hidden_sizes),
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            input_tensor=x,
            is_training=is_train)[1]
    else:
        raise NotImplementedError('Chelsea does not support this.')

    if std_network is not None:
        raise NotImplementedError('Minimal Gaussian MLP does not support this.')
    else:
        if adaptive_std:
            # NOTE - this branch isn't tested
            raise NotImplementedError('Minimal Gaussian MLP doesnt have a tested version of this.')
            # NOTE(review): everything below the raise in this branch is unreachable.
            self.std_params = std_params = self.create_MLP(
                name="std_network",
                input_shape=(None, obs_dim,),
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
            )
            # if you want to input your own thing.
            self._forward_std = lambda x: self.forward_MLP(
                'std_network',
                std_params,
                n_hidden=len(hidden_sizes),
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=tf.identity,
                input_tensor=x)[1]
        else:
            # Single learned per-dimension std parameter (state-independent).
            if std_parametrization == 'exp':
                init_std_param = np.log(init_std)
            elif std_parametrization == 'softplus':
                init_std_param = np.log(np.exp(init_std) - 1)
            else:
                raise NotImplementedError
            self.std_params = make_param_layer(
                num_units=action_dim,
                param=tf.constant_initializer(init_std_param),
                name="output_std_param",
                trainable=learn_std,
            )
            self._forward_std = lambda x: forward_param_layer(x, self.std_params)

        self.std_parametrization = std_parametrization

        # Clip the std parameter from below, in the same parametrization.
        if std_parametrization == 'exp':
            min_std_param = np.log(min_std)
        elif std_parametrization == 'softplus':
            min_std_param = np.log(np.exp(min_std) - 1)
        else:
            raise NotImplementedError

        self.min_std_param = min_std_param

    self._dist = DiagonalGaussian(action_dim)
    self._cached_params = {}
    super(GaussianMLPPolicy, self).__init__(env_spec)

    dist_info_sym = self.dist_info_sym(input_tensor, dict(), is_training=False)
    mean_var = dist_info_sym["mean"]
    log_std_var = dist_info_sym["log_std"]

    # Compiled function obs -> (mean, log_std) for sampling actions.
    self._f_dist = tensor_utils.compile_function(
        inputs=[input_tensor],
        outputs=[mean_var, log_std_var],
    )
def __init__(
        self,
        name,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        num_tasks=1,
        init_std=1.0,
        adaptive_std=False,
        bias_transform=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=tf.nn.tanh,
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=tf.identity,
        mean_network=None,
        std_network=None,
        std_parametrization='exp',
        grad_step_size=0.1,
        trainable_step_size=False,
        stop_grad=False,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std
    :param adaptive_std: boolean indicating whether std shall be a trainable variable
    :param bias_transform: boolean indicating whether bias transformation shall be added to the MLP
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers for std
    :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :param std_parametrization: how the std should be parametrized. There are a few options:
        - exp: the logarithm of the std will be stored, and applied a exponential transformation
        - softplus: the std will be computed as log(1+exp(x))
    :param grad_step_size: (float) the step size taken in the learner's gradient update
    :param trainable_step_size: boolean indicating whether the inner grad_step_size shall be trainable
    :param stop_grad: whether or not to stop the gradient through the gradient.
    :param num_tasks: number of MAML tasks handled in one batch
    """
    Serializable.quick_init(self, locals())

    # Build the underlying Gaussian MLP policy (networks, std parametrization, etc.).
    BaseMLPPolicy.__init__(
        self,
        name,
        env_spec,
        hidden_sizes=hidden_sizes,
        learn_std=learn_std,
        init_std=init_std,
        adaptive_std=adaptive_std,
        bias_transform=bias_transform,
        std_share_network=std_share_network,
        std_hidden_sizes=std_hidden_sizes,
        min_std=min_std,
        std_hidden_nonlinearity=std_hidden_nonlinearity,
        hidden_nonlinearity=hidden_nonlinearity,
        output_nonlinearity=output_nonlinearity,
        mean_network=mean_network,
        std_network=std_network,
    )

    self.stop_grad = stop_grad
    self.num_tasks = num_tasks

    # MAML bookkeeping: symbolic fast-update tensors and their numeric values.
    self.all_fast_params_tensor = []
    self._all_param_gradients = []
    self.all_param_vals = None  # [self.get_variable_values(self.all_params)] * num_tasks
    self.init_param_vals = None
    self.param_step_sizes = {}
    self.grad_step_size = grad_step_size
    self.trainable_step_size = trainable_step_size
    self._update_input_keys = ['observations', 'actions', 'advantages']

    with tf.variable_scope(self.name):
        # Create placeholders for the param weights of the different tasks
        self.all_params_ph = [
            OrderedDict([(key, tf.placeholder(tf.float32, shape=value.shape))
                         for key, value in self.all_params.items()])
            for _ in range(num_tasks)
        ]

        # Create the variables for the inner learning rate
        # (one per-element step size tensor per parameter).
        for key, param in self.all_params.items():
            shape = param.get_shape().as_list()
            init_stepsize = np.ones(shape, dtype=np.float32) * self.grad_step_size
            self.param_step_sizes[key + "_step_size"] = tf.Variable(
                initial_value=init_stepsize,
                name='step_size_%s' % key,
                dtype=tf.float32,
                trainable=self.trainable_step_size)

    # compile the _cur_f_dist with updated params
    outputs = []
    with tf.variable_scope("post_updated_policy"):
        # Batched input is split into one slice per task.
        inputs = tf.split(self.input_tensor, self.num_tasks, 0)
        for i in range(self.num_tasks):
            task_inp = inputs[i]
            dist_info, _ = self.dist_info_sym(
                task_inp,
                dict(),
                all_params=self.all_params_ph[i],
                is_training=False)
            outputs.append([dist_info['mean'], dist_info['log_std']])

        # TODO: Set a different name for this _cur_f_dist, so you can obtain actions w/o needing the params if
        # TODO: you aren't using the get_actions_batch (it'll be needed at test time when you are just evaluating
        # TODO: in one task)
        self._batch_cur_f_dist = tensor_utils.compile_function(
            inputs=[self.input_tensor] + sum([
                list(param_task_ph.values())
                for param_task_ph in self.all_params_ph
            ], []),  # All the parameter values of the policy
            outputs=outputs,
        )
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        optimizer=None,
        normalize_inputs=True,
):
    """Deterministic MLP regressor fit by minimizing mean squared error.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param normalize_inputs: whether inputs are whitened with the stored mean/std
        variables before being fed to the network.
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")

        self.output_dim = output_dim
        self.optimizer = optimizer

        if network is None:
            network = MLP(input_shape=input_shape,
                          output_dim=output_dim,
                          hidden_sizes=hidden_sizes,
                          hidden_nonlinearity=hidden_nonlinearity,
                          output_nonlinearity=output_nonlinearity,
                          name="network")

        l_out = network.output_layer

        LayersPowered.__init__(self, [l_out])

        xs_var = network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32,
                                shape=[None, output_dim],
                                name="ys")

        # Input whitening statistics; updated externally when fitting.
        x_mean_var = tf.get_variable(name="x_mean",
                                     shape=(1, ) + input_shape,
                                     initializer=tf.constant_initializer(
                                         0., dtype=tf.float32))
        x_std_var = tf.get_variable(name="x_std",
                                    shape=(1, ) + input_shape,
                                    initializer=tf.constant_initializer(
                                        1., dtype=tf.float32))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        fit_ys_var = L.get_output(l_out,
                                  {network.input_layer: normalized_xs_var})

        # BUG FIX: the loss previously was "-tf.reduce_mean(tf.square(...))";
        # the optimizer *minimizes* this quantity, and minimizing negative
        # squared error maximizes the prediction error. Use positive MSE.
        loss = tf.reduce_mean(tf.square(fit_ys_var - ys_var))

        self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var)

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[fit_ys_var],
        )

        optimizer_args["inputs"] = [xs_var, ys_var]

        self.optimizer.update_opt(**optimizer_args)

        self.name = name
        self.l_out = l_out

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
def init_opt(self):
    """Build the MAML meta-objective graph and hand it to the optimizer.

    Constructs num_grad_updates symbolic inner gradient steps per task, then a
    final 'test' surrogate objective on the post-update parameters, plus KL
    diagnostics compiled into self.opt_info['f_kl'].
    """
    # TODO Commented out all KL stuff for now, since it is only used for logging
    # To see how it can be turned on, see maml_npo.py
    is_recurrent = int(self.policy.recurrent)
    assert not is_recurrent  # not supported right now.
    dist = self.policy.distribution

    # Placeholders for the old (pre-update) distribution params of each task,
    # used only for the KL diagnostics below.
    old_dist_info_vars, old_dist_info_vars_list = [], []
    for i in range(self.meta_batch_size):
        old_dist_info_vars.append({
            k: tf.placeholder(tf.float32,
                              shape=[None] + list(shape),
                              name='old_%s_%s' % (i, k))
            for k, shape in dist.dist_info_specs
        })
        old_dist_info_vars_list += [
            old_dist_info_vars[i][k] for k in dist.dist_info_keys
        ]

    state_info_vars, state_info_vars_list = {}, []

    all_surr_objs, input_list = [], []
    new_params = None
    # Inner loop: one symbolic gradient step per j, chaining the params from
    # the previous step.
    for j in range(self.num_grad_updates):
        obs_vars, action_vars, adv_vars = self.make_vars(str(j))
        surr_objs = []

        cur_params = new_params
        new_params = []

        for i in range(self.meta_batch_size):
            if j == 0:
                # First step uses the policy's initial (meta) parameters.
                dist_info_vars, params = self.policy.dist_info_sym(
                    obs_vars[i],
                    state_info_vars,
                    all_params=self.policy.all_params)
            else:
                # Subsequent steps use the symbolically-updated params from
                # the previous inner step.
                dist_info_vars, params = self.policy.updated_dist_info_sym(
                    i,
                    all_surr_objs[-1][i],
                    obs_vars[i],
                    params_dict=cur_params[i])

            new_params.append(params)
            logli = dist.log_likelihood_sym(action_vars[i], dist_info_vars)

            # formulate as a minimization problem
            # The gradient of the surrogate objective is the policy gradient
            surr_objs.append(-tf.reduce_mean(logli * adv_vars[i]))

        input_list += obs_vars + action_vars + adv_vars + state_info_vars_list
        if j == 0:
            # For computing the fast update for sampling
            self.policy.set_init_surr_obj(input_list, surr_objs)
            init_input_list = input_list

        all_surr_objs.append(surr_objs)

    # Outer ('test') objective evaluated with the final post-update params.
    obs_vars, action_vars, adv_vars = self.make_vars('test')
    surr_objs = []
    kls = []
    for i in range(self.meta_batch_size):
        dist_info_vars, _ = self.policy.updated_dist_info_sym(
            i, all_surr_objs[-1][i], obs_vars[i], params_dict=new_params[i])
        logli = dist.log_likelihood_sym(action_vars[i], dist_info_vars)

        surr_objs.append(-tf.reduce_mean(logli * adv_vars[i]))
        kls.append(dist.kl_sym(old_dist_info_vars[i], dist_info_vars))

    # Meta objective: mean of per-task post-update surrogate losses.
    surr_obj = tf.reduce_mean(tf.stack(surr_objs, 0))
    mean_kl = tf.reduce_mean(tf.concat(kls, 0))
    max_kl = tf.reduce_max(tf.concat(kls, 0))

    input_list += obs_vars + action_vars + adv_vars

    if self.use_maml:
        self.optimizer.update_opt(loss=surr_obj,
                                  target=self.policy,
                                  inputs=input_list)
    else:
        # baseline method of just training initial policy
        self.optimizer.update_opt(loss=tf.reduce_mean(
            tf.stack(all_surr_objs[0], 0)),
                                  target=self.policy,
                                  inputs=init_input_list)

    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl, )
def __init__(
        self,
        name,
        env_spec,
        hidden_dim=32,
        feature_network=None,
        state_include_action=True,
        hidden_nonlinearity=tf.tanh,
        gru_layer_cls=L.GRULayer,
):
    """Recurrent (GRU) categorical policy.

    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param feature_network: optional network that maps the raw (obs[, action])
        input to a feature vector fed to the GRU.
    :param state_include_action: if True, the previous action is concatenated
        to the observation as policy input.
    :return:
    """
    with tf.variable_scope(name):
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        # (batch, time, input_dim) input for whole trajectories.
        l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            # Reshape the flat (batch*time, feature) output back to
            # (batch, time, feature) for the recurrent layer.
            # NOTE(review): tf.pack is the pre-TF-1.0 name of tf.stack —
            # confirm the targeted TF version still exposes it.
            l_feature = L.OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: tf.reshape(
                    flat_feature,
                    tf.pack([
                        tf.shape(input)[0],
                        tf.shape(input)[1], feature_dim
                    ])),
                shape_op=lambda _, input_shape:
                (input_shape[0], input_shape[1], feature_dim))

        prob_network = GRUNetwork(input_shape=(feature_dim, ),
                                  input_layer=l_feature,
                                  output_dim=env_spec.action_space.n,
                                  hidden_dim=hidden_dim,
                                  hidden_nonlinearity=hidden_nonlinearity,
                                  output_nonlinearity=tf.nn.softmax,
                                  gru_layer_cls=gru_layer_cls,
                                  name="prob_network")

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        # Single-timestep input used by the step function during rollouts.
        flat_input_var = tf.placeholder(dtype=tf.float32,
                                        shape=(None, input_dim),
                                        name="flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature, {feature_network.input_layer: flat_input_var})

        # One-step function: (flat input, prev hidden) -> (action probs, new hidden).
        self.f_step_prob = tensor_utils.compile_function(
            [
                flat_input_var,
                prob_network.step_prev_hidden_layer.input_var
            ],
            L.get_output([
                prob_network.step_output_layer,
                prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var}))

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        # Per-environment rollout state (set in reset()).
        self.prev_actions = None
        self.prev_hiddens = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LayersPowered.__init__(self, out_layers)
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement methods of the
    :class:`rllab_maml.core.paramerized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
    that the first dimension of these inputs should correspond to the number of data points
    :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = tf.gradients(loss, xs=params)
    # Parameters not reached by the loss get zero gradients instead of None so
    # the flattened gradient keeps a fixed layout.
    for idx, (grad, param) in enumerate(zip(grads, params)):
        if grad is None:
            grads[idx] = tf.zeros_like(param)
    flat_grad = tensor_utils.flatten_tensor_variables(grads)

    # f=KL-divergence and target is policy; builds the Hessian-vector-product
    # machinery used by the conjugate gradient solver.
    self._hvp_approach.update_opt(f=constraint_term,
                                  target=target,
                                  inputs=inputs + extra_inputs,
                                  reg_coeff=self._reg_coeff)

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    # Lazily-compiled evaluation functions (compiled on first use).
    self._opt_fun = ext.lazydict(
        f_loss=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
        ),
        f_grad=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
        ),
        f_constraint=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
        ),
        f_loss_constraint=lambda: tensor_utils.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
        ),
    )
def __init__(
        self,
        name,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
):
    """Categorical MLP regressor fit by maximum likelihood, optionally with a
    KL trust-region constraint.

    :param input_shape: Shape of the input data.
    :param output_dim: Dimension of output.
    :param hidden_sizes: Number of hidden units of each layer of the mean network.
    :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
    :param optimizer: Optimizer for minimizing the negative log-likelihood.
    :param use_trust_region: Whether to use trust region constraint.
    :param step_size: KL divergence constraint for each iteration
    :param no_initial_trust_region: if True, the first fit skips the
        trust-region optimizer (first_optimized starts False).
    """
    Serializable.quick_init(self, locals())

    with tf.variable_scope(name):
        if optimizer is None:
            optimizer = LbfgsOptimizer(name="optimizer")
        if tr_optimizer is None:
            tr_optimizer = ConjugateGradientOptimizer()

        self.output_dim = output_dim
        self.optimizer = optimizer
        self.tr_optimizer = tr_optimizer

        if prob_network is None:
            prob_network = MLP(input_shape=input_shape,
                               output_dim=output_dim,
                               hidden_sizes=hidden_sizes,
                               hidden_nonlinearity=hidden_nonlinearity,
                               output_nonlinearity=tf.nn.softmax,
                               name="prob_network")

        l_prob = prob_network.output_layer

        LayersPowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = tf.placeholder(dtype=tf.float32,
                                shape=[None, output_dim],
                                name="ys")
        # Old predicted probabilities, used for the KL trust-region constraint.
        old_prob_var = tf.placeholder(dtype=tf.float32,
                                      shape=[None, output_dim],
                                      name="old_prob")

        # Input whitening statistics; updated externally when fitting.
        x_mean_var = tf.get_variable(name="x_mean",
                                     shape=(1, ) + input_shape,
                                     initializer=tf.constant_initializer(
                                         0., dtype=tf.float32))
        x_std_var = tf.get_variable(name="x_std",
                                    shape=(1, ) + input_shape,
                                    initializer=tf.constant_initializer(
                                        1., dtype=tf.float32))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(
            l_prob, {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

        # Negative log-likelihood of the one-hot targets.
        loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

        # Hard (argmax) predictions as one-hot vectors.
        # NOTE(review): 'dimension=' is the deprecated kwarg of tf.argmax
        # (renamed 'axis' in TF 1.x) — confirm the targeted TF version.
        predicted = tensor_utils.to_onehot_sym(
            tf.argmax(prob_var, dimension=1), output_dim)

        self.prob_network = prob_network
        self.f_predict = tensor_utils.compile_function([xs_var], predicted)
        self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
        self.l_prob = l_prob

        self.optimizer.update_opt(loss=loss,
                                  target=self,
                                  network_outputs=[prob_var],
                                  inputs=[xs_var, ys_var])
        self.tr_optimizer.update_opt(loss=loss,
                                     target=self,
                                     network_outputs=[prob_var],
                                     inputs=[xs_var, ys_var, old_prob_var],
                                     leq_constraint=(mean_kl, step_size))

        self.use_trust_region = use_trust_region
        self.name = name

        self.normalize_inputs = normalize_inputs
        self.x_mean_var = x_mean_var
        self.x_std_var = x_std_var
        self.first_optimized = not no_initial_trust_region
def __init__(
        self,
        name,
        env_spec,
        hidden_dim=32,
        feature_network=None,
        state_include_action=True,
        hidden_nonlinearity=tf.tanh,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
        lstm_layer_cls=L.LSTMLayer,
):
    """Recurrent (LSTM) Gaussian policy.

    :param env_spec: A spec for the env.
    :param hidden_dim: dimension of hidden layer
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param learn_std: whether the (state-independent) log std is trainable
    :param init_std: initial std value
    :param state_include_action: if True, the previous action is concatenated
        to the observation as policy input.
    :return:
    """
    with tf.variable_scope(name):
        Serializable.quick_init(self, locals())
        super(GaussianLSTMPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        # (batch, time, input_dim) input for whole trajectories.
        l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            # Reshape the flat (batch*time, feature) output back to
            # (batch, time, feature) for the recurrent layer.
            # NOTE(review): tf.pack is the pre-TF-1.0 name of tf.stack —
            # confirm the targeted TF version still exposes it.
            l_feature = L.OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: tf.reshape(
                    flat_feature,
                    tf.pack([
                        tf.shape(input)[0],
                        tf.shape(input)[1], feature_dim
                    ])),
                shape_op=lambda _, input_shape:
                (input_shape[0], input_shape[1], feature_dim))

        mean_network = LSTMNetwork(input_shape=(feature_dim, ),
                                   input_layer=l_feature,
                                   output_dim=action_dim,
                                   hidden_dim=hidden_dim,
                                   hidden_nonlinearity=hidden_nonlinearity,
                                   output_nonlinearity=output_nonlinearity,
                                   lstm_layer_cls=lstm_layer_cls,
                                   name="mean_network")

        # State-independent learned log std, shared between the trajectory
        # network and the single-step network (same underlying param).
        l_log_std = L.ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=tf.constant_initializer(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )
        l_step_log_std = L.ParamLayer(
            mean_network.step_input_layer,
            num_units=action_dim,
            param=l_log_std.param,
            name="step_output_log_std",
            trainable=learn_std,
        )

        self.mean_network = mean_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        # Single-timestep input used by the step function during rollouts.
        flat_input_var = tf.placeholder(dtype=tf.float32,
                                        shape=(None, input_dim),
                                        name="flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature, {feature_network.input_layer: flat_input_var})

        # One-step function:
        # (flat input, prev hidden, prev cell) -> (mean, log_std, hidden, cell).
        self.f_step_mean_std = tensor_utils.compile_function(
            [
                flat_input_var,
                mean_network.step_prev_hidden_layer.input_var,
                mean_network.step_prev_cell_layer.input_var
            ],
            L.get_output([
                mean_network.step_output_layer, l_step_log_std,
                mean_network.step_hidden_layer, mean_network.step_cell_layer
            ], {mean_network.step_input_layer: feature_var}))

        self.l_log_std = l_log_std

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        # Per-environment rollout state (set in reset()).
        self.prev_actions = None
        self.prev_hiddens = None
        self.prev_cells = None
        self.dist = RecurrentDiagonalGaussian(action_dim)

        out_layers = [mean_network.output_layer, l_log_std]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LayersPowered.__init__(self, out_layers)