def _build_module(self, input_layer):
    # Standard V Network
    q_outputs = []
    self.target = tf.placeholder(tf.float32, shape=(None, 1), name="q_networks_min_placeholder")
    for i in range(input_layer.shape[0]):  # assuming that the actual size is 2, as there are two critic networks
        if self.initializer == 'normalized_columns':
            q_outputs.append(
                self.dense_layer(1)(input_layer[i],
                                    name='q_output_{}'.format(i + 1),
                                    kernel_initializer=normalized_columns_initializer(1.0),
                                    bias_initializer=self.output_bias_initializer))
        elif self.initializer == 'xavier' or self.initializer is None:
            q_outputs.append(
                self.dense_layer(1)(input_layer[i],
                                    name='q_output_{}'.format(i + 1),
                                    bias_initializer=self.output_bias_initializer))
        self.output.append(q_outputs[i])
        self.loss.append(tf.reduce_mean((self.target - q_outputs[i]) ** 2))

    self.output.append(tf.reduce_min(q_outputs, axis=0))
    self.output.append(tf.reduce_mean(self.output[0]))
    self.loss = sum(self.loss)
    tf.losses.add_loss(self.loss)
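# The reduce_min output above is what makes this a clipped double-Q ("twin critic") head.
# Below is a minimal, self-contained numpy sketch (not part of the original module) of how
# such a min-Q value is typically consumed when forming the TD target; `rewards`, `dones`,
# `q1_next` and `q2_next` are hypothetical arrays standing in for the two target critics' outputs.
import numpy as np

def clipped_double_q_target(rewards, dones, q1_next, q2_next, discount=0.99):
    # taking the minimum over the two critics' estimates reduces overestimation bias
    min_q_next = np.minimum(q1_next, q2_next)
    return rewards + discount * (1.0 - dones) * min_q_next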
def _build_continuous_net(self, input_layer, action_space):
    num_actions = action_space.shape[0]
    self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
    self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
    self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
    self.input = [self.actions, self.old_policy_mean, self.old_policy_std]

    self.policy_mean = self.dense_layer(num_actions)(input_layer, name='policy_mean',
                                                     kernel_initializer=normalized_columns_initializer(0.01))

    # for local networks in distributed settings, we need to move variables we create manually to the
    # tf.GraphKeys.LOCAL_VARIABLES collection, since the variable scope custom getter which is set in
    # Architecture does not apply to them
    if self.is_local and isinstance(self.ap.task_parameters, DistributedTaskParameters):
        self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
                                         collections=[tf.GraphKeys.LOCAL_VARIABLES], name="policy_log_std")
    else:
        self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32', name="policy_log_std")
    self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')

    # define the distributions for the policy and the old policy
    self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean,
                                                                               self.policy_std + eps)
    self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean,
                                                                                   self.old_policy_std + eps)

    self.output = [self.policy_mean, self.policy_std]
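# A hedged sketch of how the policy / old-policy distributions built above are commonly
# combined into the PPO clipped surrogate objective. `head` stands for an instance of this
# class after _build_continuous_net has run; `advantages` and `clip_ratio` are hypothetical
# inputs, not names taken from the original code.
import tensorflow as tf

def ppo_clipped_surrogate(head, advantages, clip_ratio=0.2):
    # likelihood ratio computed in log space for numerical stability
    ratio = tf.exp(head.policy_distribution.log_prob(head.actions) -
                   head.old_policy_distribution.log_prob(head.actions))
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    # the surrogate is maximized, so the loss is its negation
    return -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))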
def _build_module(self, input_layer):
    # Standard V Network
    self.output = tf.layers.dense(input_layer, 1, name='output',
                                  kernel_initializer=normalized_columns_initializer(1.0))
def _build_module(self, input_layer):
    # Standard V Network
    if self.initializer == 'normalized_columns':
        self.output = self.dense_layer(1)(input_layer, name='output',
                                          kernel_initializer=normalized_columns_initializer(1.0))
    elif self.initializer == 'xavier' or self.initializer is None:
        self.output = self.dense_layer(1)(input_layer, name='output')
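# normalized_columns_initializer is used throughout these heads but is not defined in this
# excerpt. A common definition, following the convention popularized by A3C-style
# implementations, is sketched below; it rescales each column of a random Gaussian matrix
# to a fixed norm (std), which keeps the initial outputs of the final layer small and uniform.
import numpy as np
import tensorflow as tf

def normalized_columns_initializer(std=1.0):
    def _initializer(shape, dtype=None, partition_info=None):
        out = np.random.randn(*shape).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer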
def _build_continuous_net(self, input_layer, action_space):
    num_actions = action_space.shape[0]
    self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))

    # output activation function
    if np.all(action_space.max_abs_range < np.inf):
        # bounded actions
        self.output_scale = action_space.max_abs_range
        self.continuous_output_activation = self.activation_function
    else:
        # unbounded actions - no squashing and no scaling
        self.output_scale = 1
        self.continuous_output_activation = tf.identity

    # mean
    pre_activation_policy_values_mean = self.dense_layer(num_actions)(input_layer, name='fc_mean')
    policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean)
    self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
    self.output.append(self.policy_mean)

    # standard deviation
    if isinstance(self.exploration_policy, ContinuousEntropyParameters):
        # the stdev is an output of the network and uses a softplus activation as defined in A3C
        policy_values_std = self.dense_layer(num_actions)(input_layer,
                                                          kernel_initializer=normalized_columns_initializer(0.01),
                                                          name='fc_std')
        self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
        self.output.append(self.policy_std)
    else:
        # the stdev is an externally given value
        # Warning: we need to explicitly put this variable in the local variables collection, since defining
        # it as not trainable puts it for some reason in the global variables collection. If this is not done,
        # the variable won't be initialized, and when working with multiple workers they will get stuck.
        self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
                                      name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])

        # assign op for the policy std
        self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
        self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)

    # define the distributions for the policy and the old policy
    policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
    self.policy_distributions.append(policy_distribution)

    if self.is_local:
        # add a penalty on the squared pre-activation values of the action mean
        if self.action_penalty and self.action_penalty != 0:
            self.regularizations += [
                self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
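# A self-contained numpy sketch of the bounded-action scaling performed above: the fc_mean
# pre-activation is squashed by the activation function (tanh is assumed here as a common
# choice for self.activation_function) and then multiplied by the action space's
# max_abs_range, so every action dimension lands inside its bounds. All values are hypothetical.
import numpy as np

pre_activation = np.array([[-2.0, 0.3, 5.0]])  # hypothetical fc_mean output for one state
max_abs_range = np.array([1.0, 2.0, 0.5])      # hypothetical per-dimension action bounds
policy_mean = np.tanh(pre_activation) * max_abs_range
assert np.all(np.abs(policy_mean) <= max_abs_range)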
def _build_module(self, input_layer):
    self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
    self.input = [self.old_policy_value]
    self.output = self.dense_layer(1)(input_layer, name='output',
                                      kernel_initializer=normalized_columns_initializer(1.0))
    self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")

    # clipped value loss: the new prediction may move at most clip_likelihood_ratio_using_epsilon
    # away from the old prediction, and the pessimistic maximum of the clipped and unclipped
    # squared errors is taken
    value_loss_1 = tf.square(self.output - self.target)
    value_loss_2 = tf.square(self.old_policy_value +
                             tf.clip_by_value(self.output - self.old_policy_value,
                                              -self.clip_likelihood_ratio_using_epsilon,
                                              self.clip_likelihood_ratio_using_epsilon) - self.target)
    self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
    self.loss = self.vf_loss
    tf.losses.add_loss(self.loss)
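# A minimal numpy sketch of the clipped value loss computed above, making the pessimistic
# max explicit: the value prediction cannot reduce the loss by drifting more than epsilon
# away from the old prediction. All names below are hypothetical stand-ins.
import numpy as np

def clipped_value_loss(values, old_values, returns, epsilon=0.2):
    clipped = old_values + np.clip(values - old_values, -epsilon, epsilon)
    return np.mean(np.maximum(np.square(values - returns),
                              np.square(clipped - returns)))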
def _build_continuous_net(self, input_layer, action_space):
    num_actions = action_space.shape[0]
    self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
    self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
    self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
    self.input = [self.actions, self.old_policy_mean, self.old_policy_std]

    self.policy_mean = self.dense_layer(num_actions)(input_layer, name='policy_mean',
                                                     kernel_initializer=normalized_columns_initializer(0.01))

    if self.is_local:
        self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
                                         collections=[tf.GraphKeys.LOCAL_VARIABLES])
    else:
        self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
    self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')

    # define the distributions for the policy and the old policy
    self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean,
                                                                               self.policy_std + eps)
    self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean,
                                                                                   self.old_policy_std + eps)

    self.output = [self.policy_mean, self.policy_std]
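# A hedged sketch of a common diagnostic built from the two distributions above: the mean
# KL divergence between the old and current policy, often used for adaptive penalty
# coefficients or early stopping in PPO. `head` is assumed to be a built head instance;
# kl_divergence is registered for MultivariateNormalDiag in tf.contrib.distributions.
import tensorflow as tf

def mean_policy_kl(head):
    # KL(old || new), averaged over the batch
    return tf.reduce_mean(tf.contrib.distributions.kl_divergence(
        head.old_policy_distribution, head.policy_distribution))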