def build_graph(self):
    """
    Builds computational graph for policy
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # build the actual policy network
        self.input_var, self.cnn_output_var = create_cnn(
            name='cnn',
            hidden_nonlinearity=self.hidden_nonlinearity,
            kernel_sizes=self.kernel_sizes,
            strides=self.strides,
            num_filters=self.num_filters,
            input_dim=(None,) + self.input_dim,
            input_var=self.input_var,
        )

        _, self.output_var = create_mlp(
            name='mlp',
            output_dim=self.output_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_var=self.cnn_output_var,
            batch_normalization=self.batch_normalization,
        )

        current_scope = tf.get_default_graph().get_name_scope()
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self._params = OrderedDict([
            (remove_scope_from_name(var.name, current_scope), var)
            for var in trainable_policy_vars
        ])

def build_graph(self):
    """
    Builds computational graph for policy
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # build the actual policy network
        args = create_rnn(
            name='rnn',
            cell_type=self._cell_type,
            output_dim=self.output_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, None, self.input_dim,),
            input_var=self.input_var,
            state_var=self.state_var,
        )
        self.input_var, self.state_var, self.output_var, self.next_state_var, self.cell = args

        current_scope = tf.get_default_graph().get_name_scope()
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self._params = OrderedDict([
            (remove_scope_from_name(var.name, current_scope), var)
            for var in trainable_policy_vars
        ])

def build_graph(self):
    """
    Builds computational graph for policy
    """
    # with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
    # build the actual policy network
    self.input_var, self.output_var = create_mlp(
        name='mlp',
        output_dim=self.output_dim,
        hidden_sizes=self.hidden_sizes,
        hidden_nonlinearity=self.hidden_nonlinearity,
        output_nonlinearity=self.output_nonlinearity,
        input_dim=(None, self.input_dim,),
        input_var=self.input_var,
        batch_normalization=self.batch_normalization,
    )

    # save the policy's trainable variables in dicts
    # current_scope = tf.get_default_graph().get_name_scope()
    current_scope = self.name
    trainable_policy_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
    self._params = OrderedDict([
        (remove_scope_from_name(var.name, current_scope), var)
        for var in trainable_policy_vars
    ])

def _create_placeholders_for_vars(self, scope, graph_keys=tf.GraphKeys.TRAINABLE_VARIABLES):
    """
    Creates a float32 placeholder for every variable collected from the given
    scope and graph collection, keyed by the scope-stripped variable name.
    """
    var_list = tf.get_collection(graph_keys, scope=scope)
    placeholders = []
    for var in var_list:
        var_name = remove_scope_from_name(var.name, scope.split('/')[0])
        placeholders.append(
            (var_name, tf.placeholder(tf.float32, shape=var.shape, name="%s_ph" % var_name)))
    return OrderedDict(placeholders)

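# Hedged, self-contained sketch (not from the source): how placeholders that
# mirror trainable variables, like those returned by _create_placeholders_for_vars,
# are typically paired with tf.assign ops so parameter values can be injected
# at run time via feed_dict. The scope "demo" and variable "w" are illustrative
# assumptions, not names from this codebase.
import numpy as np
import tensorflow as tf

with tf.variable_scope("demo"):
    w = tf.get_variable("w", shape=(3, 2), dtype=tf.float32)

w_ph = tf.placeholder(tf.float32, shape=w.shape, name="w_ph")  # mirrors w
assign_op = tf.assign(w, w_ph)                                 # writes the fed value into w

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_op, feed_dict={w_ph: np.ones((3, 2), dtype=np.float32)})
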
def build_graph(self):
    """
    Builds computational graph for policy
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # build the actual policy network
        self.obs_var, self.mean_var = create_mlp(
            name='mean_network',
            output_dim=self.action_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, self.obs_dim,),
        )

        with tf.variable_scope("log_std_network", reuse=tf.AUTO_REUSE):
            log_std_var = tf.get_variable(
                name='log_std_var',
                shape=(1, self.action_dim,),
                dtype=tf.float32,
                initializer=tf.constant_initializer(self.init_log_std),
                trainable=self.learn_std,
            )
            self.log_std_var = tf.maximum(log_std_var, self.min_log_std, name='log_std')

        # symbolically define sampled action and distribution
        self.action_var = self.mean_var + tf.random_normal(
            shape=tf.shape(self.mean_var)) * tf.exp(log_std_var)
        self._dist = DiagonalGaussian(self.action_dim)

        # save the policy's trainable variables in dicts
        # current_scope = tf.get_default_graph().get_name_scope()
        current_scope = self.name
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self.policy_params = OrderedDict([
            (remove_scope_from_name(var.name, current_scope), var)
            for var in trainable_policy_vars
        ])

        self.policy_params_ph = self._create_placeholders_for_vars(
            scope=self.name + "/mean_network")
        log_std_network_phs = self._create_placeholders_for_vars(
            scope=self.name + "/log_std_network")
        self.policy_params_ph.update(log_std_network_phs)
        self.policy_params_keys = self.policy_params_ph.keys()

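# Hedged NumPy sketch (not from the source) of the reparameterized sampling used
# above: action = mean + eps * exp(log_std), with eps drawn from a standard normal.
# The concrete numbers are illustrative only.
import numpy as np

mean = np.array([0.5, -1.0])
log_std = np.array([-0.7, -0.7])       # std = exp(-0.7) ~= 0.5
eps = np.random.randn(*mean.shape)     # eps ~ N(0, I)
action = mean + eps * np.exp(log_std)  # sample from N(mean, diag(exp(log_std)^2))
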
def build_graph(self):
    """
    Builds computational graph for policy
    """
    with tf.variable_scope(self.name):
        # build the actual policy network
        rnn_outs = create_rnn(
            name='mean_network',
            cell_type=self._cell_type,
            output_dim=self.action_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, None, self.obs_dim,),
        )
        self.obs_var, self.hidden_var, self.mean_var, self.next_hidden_var, self.cell = rnn_outs

        with tf.variable_scope("log_std_network"):
            log_std_var = tf.get_variable(
                name='log_std_var',
                shape=(1, self.action_dim,),
                dtype=tf.float32,
                initializer=tf.constant_initializer(self.init_log_std),
                trainable=self.learn_std)
            self.log_std_var = tf.maximum(log_std_var, self.min_log_std, name='log_std')

        # symbolically define sampled action and distribution
        self._dist = DiagonalGaussian(self.action_dim)

        # save the policy's trainable variables in dicts
        current_scope = tf.get_default_graph().get_name_scope()
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self.policy_params = OrderedDict([
            (remove_scope_from_name(var.name, current_scope), var)
            for var in trainable_policy_vars
        ])

def build_graph(self):
    """
    Builds computational graph for policy
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # build the actual policy network
        self.obs_var, self.output_var = create_mlp(
            name='network',
            output_dim=2 * self.action_dim,
            hidden_sizes=self.hidden_sizes,
            hidden_nonlinearity=self.hidden_nonlinearity,
            output_nonlinearity=self.output_nonlinearity,
            input_dim=(None, self.obs_dim,),
        )
        self.mean_var, self.log_std_var = tf.split(self.output_var, 2, axis=-1)
        self.log_std_var = tf.clip_by_value(
            self.log_std_var, LOG_SIG_MIN, LOG_SIG_MAX, name='log_std')

        # symbolically define sampled action and distribution
        self.action_var = self.mean_var + tf.random_normal(
            shape=tf.shape(self.mean_var)) * tf.exp(self.log_std_var)
        self._dist = DiagonalGaussian(self.action_dim, squashed=self.squashed)

        # save the policy's trainable variables in dicts
        current_scope = self.name
        trainable_policy_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope=current_scope)
        self.policy_params = OrderedDict([
            (remove_scope_from_name(var.name, current_scope), var)
            for var in trainable_policy_vars
        ])

        self.policy_params_ph = self._create_placeholders_for_vars(
            scope=self.name + "/network")
        self.policy_params_keys = self.policy_params_ph.keys()
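
# Hedged, self-contained sketch (not from the source) of the variable-collection
# pattern shared by the build_graph variants above: build the network inside a
# named variable scope, gather its trainable variables, and key them by their
# scope-stripped names. The scope name "policy", the layer sizes, and the inline
# stand-in for remove_scope_from_name are illustrative assumptions.
from collections import OrderedDict

import tensorflow as tf

with tf.variable_scope("policy", reuse=tf.AUTO_REUSE):
    obs = tf.placeholder(tf.float32, shape=(None, 4))
    hidden = tf.layers.dense(obs, 8, activation=tf.nn.tanh, name="hidden_0")
    output = tf.layers.dense(hidden, 2, name="output")

scope = "policy"
trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
params = OrderedDict(
    # stand-in for remove_scope_from_name: drop the "<scope>/" prefix and ":0" suffix
    (var.name.replace(scope + "/", "").split(":")[0], var)
    for var in trainable_vars
)
# params keys then look like "hidden_0/kernel", "hidden_0/bias", "output/kernel", ...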