    def _build_module(self, input_layer):
        # Standard V Network
        q_outputs = []
        self.target = tf.placeholder(tf.float32,
                                     shape=(None, 1),
                                     name="q_networks_min_placeholder")

        # assuming the leading dimension is 2, as there are two critic networks
        for i in range(input_layer.shape[0]):
            if self.initializer == 'normalized_columns':
                q_outputs.append(
                    self.dense_layer(1)(
                        input_layer[i],
                        name='q_output_{}'.format(i + 1),
                        kernel_initializer=normalized_columns_initializer(1.0),
                        bias_initializer=self.output_bias_initializer))
            elif self.initializer == 'xavier' or self.initializer is None:
                q_outputs.append(
                    self.dense_layer(1)(
                        input_layer[i],
                        name='q_output_{}'.format(i + 1),
                        bias_initializer=self.output_bias_initializer))

            self.output.append(q_outputs[i])
            self.loss.append(tf.reduce_mean((self.target - q_outputs[i])**2))

        self.output.append(tf.reduce_min(q_outputs, axis=0))
        self.output.append(tf.reduce_mean(self.output[0]))
        self.loss = sum(self.loss)
        tf.losses.add_loss(self.loss)
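
The head above regresses each critic toward the shared `self.target` placeholder and also exposes the element-wise minimum over the critic outputs. A minimal numpy sketch of how such a min-over-critics value is typically turned into the bootstrap target fed back into that placeholder (TD3-style clipped double Q-learning; the function and argument names are illustrative, not part of the snippet):

import numpy as np

def td3_style_target(rewards, dones, q_min_next, discount=0.99):
    # rewards, dones, q_min_next: arrays of shape (batch_size, 1);
    # q_min_next is the minimum over the target critics, i.e. the head's
    # tf.reduce_min output evaluated on the next states.
    return rewards + discount * (1.0 - dones) * q_min_next
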
Example #2
    def _build_continuous_net(self, input_layer, action_space):
        num_actions = action_space.shape[0]
        self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")

        self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
        self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")

        self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
        self.policy_mean = self.dense_layer(num_actions)(
            input_layer, name='policy_mean',
            kernel_initializer=normalized_columns_initializer(0.01))

        # for local networks in distributed settings, we need to move variables we create manually to the
        # tf.GraphKeys.LOCAL_VARIABLES collection, since the variable scope custom getter which is set in
        # Architecture does not apply to them
        if self.is_local and isinstance(self.ap.task_parameters, DistributedTaskParameters):
            self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
                                             collections=[tf.GraphKeys.LOCAL_VARIABLES], name="policy_log_std")
        else:
            self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32', name="policy_log_std")

        self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')

        # define the distributions for the policy and the old policy
        self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
        self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)

        self.output = [self.policy_mean, self.policy_std]
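
The two MultivariateNormalDiag distributions built above are what a PPO-style surrogate objective compares. A rough sketch of deriving the likelihood ratio from such distributions, written as a standalone helper with illustrative names rather than as part of the head:

import tensorflow as tf

def ppo_likelihood_ratio(policy_distribution, old_policy_distribution, actions):
    # ratio pi(a|s) / pi_old(a|s), computed in log space for numerical stability
    log_prob = policy_distribution.log_prob(actions)
    old_log_prob = old_policy_distribution.log_prob(actions)
    return tf.exp(log_prob - old_log_prob)
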
Example #3
    def _build_module(self, input_layer):
        # Standard V Network
        self.output = tf.layers.dense(
            input_layer,
            1,
            name='output',
            kernel_initializer=normalized_columns_initializer(1.0))
Example #4
    def _build_module(self, input_layer):
        # Standard V Network
        if self.initializer == 'normalized_columns':
            self.output = self.dense_layer(1)(
                input_layer,
                name='output',
                kernel_initializer=normalized_columns_initializer(1.0))
        elif self.initializer == 'xavier' or self.initializer is None:
            self.output = self.dense_layer(1)(input_layer, name='output')
Example #5
    def _build_continuous_net(self, input_layer, action_space):
        num_actions = action_space.shape[0]
        self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))

        # output activation function
        if np.all(self.spaces.action.max_abs_range < np.inf):
            # bounded actions
            self.output_scale = action_space.max_abs_range
            self.continuous_output_activation = self.activation_function
        else:
            # unbounded actions
            self.output_scale = 1
            self.continuous_output_activation = None

        # mean
        pre_activation_policy_values_mean = self.dense_layer(num_actions)(input_layer, name='fc_mean')
        if self.continuous_output_activation is not None:
            policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean)
        else:
            # unbounded actions: no squashing activation is applied
            policy_values_mean = pre_activation_policy_values_mean
        self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')

        self.output.append(self.policy_mean)

        # standard deviation
        if isinstance(self.exploration_policy, ContinuousEntropyParameters):
            # the stdev is an output of the network and uses a softplus activation as defined in A3C
            policy_values_std = self.dense_layer(num_actions)(input_layer,
                                                              kernel_initializer=normalized_columns_initializer(0.01),
                                                              name='fc_std')
            self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps

            self.output.append(self.policy_std)
        else:
            # the stdev is an externally given value
            # Warning: we need to explicitly put this variable in the local variables collections, since defining
            # it as not trainable puts it for some reason in the global variables collections. If this is not done,
            # the variable won't be initialized and when working with multiple workers they will get stuck.
            self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
                                          name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])

            # assign op for the policy std
            self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
            self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)

        # define the distributions for the policy and the old policy
        policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
        self.policy_distributions.append(policy_distribution)

        if self.is_local:
            # add a squared penalty on the squared pre-activation features of the action
            if self.action_penalty and self.action_penalty != 0:
                self.regularizations += [
                    self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
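
Once the mean and standard deviation tensors have been evaluated, acting with this head amounts to sampling from the resulting diagonal Gaussian. A small numpy sketch of that step, assuming the mean and std have already been fetched from the session (names are illustrative):

import numpy as np

def sample_continuous_action(policy_mean, policy_std, max_abs_range=None):
    # policy_mean, policy_std: arrays of shape (num_actions,)
    action = np.random.normal(policy_mean, policy_std)
    if max_abs_range is not None:
        # keep the sampled action inside a bounded action space
        action = np.clip(action, -max_abs_range, max_abs_range)
    return action
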
Example #6
    def _build_module(self, input_layer):
        self.old_policy_value = tf.placeholder(tf.float32, [None],
                                               "old_policy_values")
        self.input = [self.old_policy_value]
        self.output = self.dense_layer(1)(
            input_layer,
            name='output',
            kernel_initializer=normalized_columns_initializer(1.0))
        self.target = self.total_return = tf.placeholder(tf.float32, [None],
                                                         name="total_return")

        value_loss_1 = tf.square(self.output - self.target)
        value_loss_2 = tf.square(
            self.old_policy_value
            + tf.clip_by_value(self.output - self.old_policy_value,
                               -self.clip_likelihood_ratio_using_epsilon,
                               self.clip_likelihood_ratio_using_epsilon)
            - self.target)
        self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
        self.loss = self.vf_loss
        tf.losses.add_loss(self.loss)
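
The loss above is the PPO-style clipped value loss: the new value prediction may move away from the old prediction by at most the clipping epsilon before being compared to the return, and the larger of the clipped and unclipped squared errors is kept. The same computation in plain numpy, outside the TF graph (illustrative only):

import numpy as np

def clipped_value_loss(values, old_values, returns, clip_epsilon):
    unclipped = np.square(values - returns)
    clipped = np.square(
        old_values + np.clip(values - old_values, -clip_epsilon, clip_epsilon) - returns)
    return np.mean(np.maximum(unclipped, clipped))
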
Example #7
    def _build_continuous_net(self, input_layer, action_space):
        num_actions = action_space.shape[0]
        self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")

        self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
        self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")

        self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
        self.policy_mean = self.dense_layer(num_actions)(
            input_layer, name='policy_mean',
            kernel_initializer=normalized_columns_initializer(0.01))
        if self.is_local:
            self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
                                            collections=[tf.GraphKeys.LOCAL_VARIABLES])
        else:
            self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')

        self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')

        # define the distributions for the policy and the old policy
        self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
        self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)

        self.output = [self.policy_mean, self.policy_std]