def build_graph(self):
    """
    Builds the computational graph for the Gaussian MLP policy.

    Creates the mean network, the (optionally trainable) log-std variable,
    the symbolic sampled action, and collects the policy's trainable
    variables into ``self.policy_params``.
    """
    with tf.variable_scope(self.name):
        # Mean network: maps observations to the mean of the action distribution.
        self.obs_var, self.mean_var = create_mlp(
            name='mean_network',
            output_dim=self.action_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, self.obs_dim,),
        )

        with tf.variable_scope("log_std_network"):
            log_std_var = tf.get_variable(
                name='log_std_var',
                shape=(1, self.action_dim,),
                dtype=tf.float32,
                initializer=tf.constant_initializer(self.init_log_std),
                trainable=self.learn_std,
            )
            # Clamp the log-std from below so the policy's exploration noise
            # never collapses below exp(min_log_std).
            self.log_std_var = tf.maximum(log_std_var, self.min_log_std,
                                          name='log_std')

        # Symbolic sampled action: reparameterized draw from
        # N(mean, exp(log_std)^2).
        # BUGFIX: sample with the clamped log-std (self.log_std_var), not the
        # raw variable, so sampled actions are consistent with the clamped
        # log-std that distribution_info_sym reports for likelihood
        # computations.
        self.action_var = self.mean_var + \
            tf.random_normal(shape=tf.shape(self.mean_var)) * tf.exp(self.log_std_var)
        self._dist = DiagonalGaussian(self.action_dim)

        # Save the policy's trainable variables, keyed by their names with the
        # enclosing scope stripped off.
        current_scope = tf.get_default_graph().get_name_scope()
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self.policy_params = OrderedDict(
            [(remove_scope_from_name(var.name, current_scope), var)
             for var in trainable_policy_vars])
def build_graph(self):
    """
    Builds the computational graph for the categorical MLP policy.

    Creates the probability network, the symbolic sampled action, and
    collects the policy's trainable variables into ``self.policy_params``.
    """
    with tf.variable_scope(self.name):
        # Probability network: maps observations to per-action probabilities.
        self.obs_var, self.prob_var = create_mlp(
            name='prob_network',
            output_dim=self.action_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, self.obs_dim,),
        )

        # Symbolic sampled action: one categorical draw per row of logits
        # (log of the network's probabilities).
        self.action_var = tf.random.categorical(tf.log(self.prob_var), 1)
        self._dist = Categorical(self.action_dim)

        # Save the policy's trainable variables, keyed by their names with
        # the enclosing scope stripped off.
        scope = tf.get_default_graph().get_name_scope()
        policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
        self.policy_params = OrderedDict(
            (remove_scope_from_name(v.name, scope), v) for v in policy_vars)
def distribution_info_sym(self, obs_var, params=None):
    """
    Return the symbolic distribution information about the actions.

    Args:
        obs_var (placeholder) : symbolic variable for observations
        params (dict) : a dictionary of placeholders or vars with the
            parameters of the MLP

    Returns:
        (dict) : a dictionary of tf placeholders for the policy output
            distribution, with keys 'mean' and 'log_std'
    """
    if params is None:
        # No explicit parameters supplied: reuse the variables that
        # build_graph created under this policy's scope.
        with tf.variable_scope(self.name):
            obs_var, mean_var = create_mlp(
                name='mean_network',
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_var=obs_var,
                reuse=True,
            )
            log_std = self.log_std_var
        return dict(mean=mean_var, log_std=log_std)

    # Explicit parameters: split them into the mean network's weights and
    # the single log-std variable, then run a stateless forward pass.
    mean_net_params = OrderedDict()
    log_std_net_params = []
    for param_name, param in params.items():
        if 'log_std_network' in param_name:
            log_std_net_params.append(param)
        else:
            # everything else belongs to 'mean_network'
            mean_net_params[param_name] = param
    assert len(log_std_net_params) == 1

    obs_var, mean_var = forward_mlp(
        output_dim=self.action_dim,
        hidden_sizes=self.hidden_sizes,
        hidden_nonlinearity=self.hidden_nonlinearity,
        output_nonlinearity=self.output_nonlinearity,
        input_var=obs_var,
        mlp_params=mean_net_params,
    )
    # NOTE(review): unlike build_graph, the supplied log-std is not clamped
    # with min_log_std here -- confirm this asymmetry is intentional.
    log_std = log_std_net_params[0]
    return dict(mean=mean_var, log_std=log_std)
def distribution_info_sym(self, obs_var, params=None):
    """
    Return the symbolic distribution information about the actions.

    Args:
        obs_var (placeholder) : symbolic variable for observations
        params (dict) : a dictionary of placeholders or vars with the
            parameters of the MLP

    Returns:
        (dict) : a dictionary of tf placeholders for the policy output
            distribution, with key 'prob'
    """
    if params is None:
        # No explicit parameters supplied: reuse the variables that
        # build_graph created under this policy's scope.
        with tf.variable_scope(self.name):
            obs_var, prob_var = create_mlp(
                name='prob_network',
                output_dim=self.action_dim,
                hidden_sizes=self.hidden_sizes,
                hidden_nonlinearity=self.hidden_nonlinearity,
                output_nonlinearity=self.output_nonlinearity,
                input_var=obs_var,
                reuse=True,
            )
        return dict(prob=prob_var)

    # Explicit parameters: run a stateless forward pass through the
    # probability network with the supplied weights (insertion order kept).
    prob_net_params = OrderedDict(params.items())
    obs_var, prob_var = forward_mlp(
        output_dim=self.action_dim,
        hidden_sizes=self.hidden_sizes,
        hidden_nonlinearity=self.hidden_nonlinearity,
        output_nonlinearity=self.output_nonlinearity,
        input_var=obs_var,
        mlp_params=prob_net_params,
    )
    return dict(prob=prob_var)