Example No. 1
    def train(self, obs_n, act_n):
        """
        Performs one gradient step on this agent's policy, maximizing the
        critic value plus an entropy bonus.
        """
        with tf.GradientTape() as tape:
            # Forward pass of the policy network on this agent's observation.
            x = obs_n[self.agent_index]
            for idx in range(self.num_layers):
                x = self.hidden_layers[idx](x)
            x = self.output_layer(x)
            act_n = tf.unstack(act_n)
            if self.use_gumbel:
                # The network outputs the log-probabilities (logits) of the
                # Gumbel-softmax distribution over discrete actions.
                logits = x
                act_n[self.agent_index] = self.gumbel_softmax_sample(logits)
                act_probs = tf.math.softmax(logits)
                entropy = -tf.math.reduce_sum(
                    act_probs * tf.math.log(act_probs + self.numeric_eps), 1)
            elif self.use_gaussian:
                # Continuous actions: sample from the Gaussian policy and use
                # the negative log-probability as the entropy term.
                logits = x
                act_n[self.agent_index] = self.gaussian_sample(logits)
                entropy = -self.action_logprob(obs_n[self.agent_index],
                                               act_n[self.agent_index])
            else:
                raise ValueError('Either use_gumbel or use_gaussian must be set.')
            # Evaluate the centralized critic with this agent's fresh action.
            q_value = self.q_network._predict_internal(obs_n + act_n)

            # Maximize Q plus the entropy bonus, i.e. minimize their negative mean.
            loss = -tf.math.reduce_mean(q_value + self.entropy_coeff * entropy)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        # Clip the gradients, then apply the optimizer update.
        local_clipped = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))
        return loss
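
The gumbel_softmax_sample helper used above is not shown on this page. As a rough sketch only, a typical Gumbel-softmax sampler looks like the following (the function below, including its temperature argument, is an assumption and not taken from the project):

import tensorflow as tf

def gumbel_softmax_sample(logits, temperature=1.0):
    # Draw Gumbel(0, 1) noise: -log(-log(U)) with U ~ Uniform(0, 1).
    uniform = tf.random.uniform(tf.shape(logits), minval=1e-20, maxval=1.0)
    gumbel_noise = -tf.math.log(-tf.math.log(uniform))
    # Softmax over the perturbed logits yields a differentiable, nearly
    # one-hot sample from the categorical distribution given by the logits.
    return tf.math.softmax((logits + gumbel_noise) / temperature)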
Example No. 2
    def _train_step_internal(self, concatenated_input, target_q, weights):
        """
        Internal function, because the concatenation cannot be done inside tf.function.
        """
        with tf.GradientTape() as tape:
            # Forward pass: concatenate the inputs and run them through the
            # hidden layers to get the Q-value prediction.
            x = self.input_concat_layer(concatenated_input)
            for idx in range(self.num_layers):
                x = self.hidden_layers[idx](x)
            q_pred = self.output_layer(x)
            # Per-sample squared TD error, weighted (e.g. by importance
            # weights from a prioritized replay buffer) before averaging.
            td_loss = tf.math.square(target_q - q_pred)
            loss = tf.reduce_mean(td_loss * weights)

        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Clip the gradients, then apply the optimizer update.
        local_clipped = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))

        # Return the per-sample TD error, e.g. for updating replay priorities.
        return td_loss
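
The docstring above hints at why the step is split into an "_internal" function: the list of per-agent inputs is assembled eagerly by the caller, while only the Keras Concatenate layer runs inside the traced step. A hypothetical wrapper (not part of the examples shown here) might look like this:

    def train_step(self, obs_n, act_n, target_q, weights):
        # Hypothetical eager wrapper: joining the per-agent observation and
        # action tensors into one Python list happens here, outside tf.function.
        return self._train_step_internal(obs_n + act_n, target_q, weights)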
Example No. 3
    def train(self, obs_n, act_n):
        with tf.GradientTape() as tape:
            # Forward pass of the policy network on this agent's observation.
            x = self.forward_pass(obs_n[self.agent_index])
            act_n = tf.unstack(act_n)
            if self.use_gumbel:
                # The network outputs the logits of the Gumbel-softmax
                # distribution over discrete actions.
                logits = x
                act_n[self.agent_index] = self.gumbel_softmax_sample(logits)
            else:
                # Continuous actions: use the network output directly.
                act_n[self.agent_index] = x
            # Evaluate the centralized critic with this agent's fresh action.
            q_value = self.q_network._predict_internal(obs_n + act_n)
            # Small L2 penalty on the raw policy output to keep it bounded.
            policy_regularization = tf.math.reduce_mean(tf.math.square(x))
            loss = -tf.math.reduce_mean(q_value) + 1e-3 * policy_regularization

        gradients = tape.gradient(loss, self.model.trainable_variables)
        # Clip the gradients, then apply the optimizer update.
        local_clipped = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))
        return loss
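
Example No. 3 factors the layer loop into a forward_pass helper. Judging from the inline version in Example No. 1, it plausibly amounts to the following sketch (the exact signature is an assumption):

    def forward_pass(self, obs):
        # Run the observation through the hidden layers and the output layer,
        # mirroring the inline loop of Example No. 1.
        x = obs
        for idx in range(self.num_layers):
            x = self.hidden_layers[idx](x)
        return self.output_layer(x)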
Example No. 4
    def _train_step_internal(self, concatenated_input, target_prob, weights):
        """
        Internal function, because the concatenation cannot be done inside tf.function.
        """
        with tf.GradientTape() as tape:
            # Forward pass: concatenate the inputs and run them through the
            # hidden layers.
            x = self.input_concat_layer(concatenated_input)
            for idx in range(self.num_layers):
                x = self.hidden_layers[idx](x)
            q_pred = self.output_layer(x)

            # Per-sample binary cross-entropy between the target probabilities
            # and the predictions. Note that `weights` is accepted but not
            # applied here, and the non-scalar loss is differentiated directly
            # (the tape takes the gradient of its sum over the batch).
            crossent_loss = tf.losses.binary_crossentropy(target_prob, q_pred)
            loss = crossent_loss

        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Clip the gradients, then apply the optimizer update.
        gradients = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))

        return crossent_loss
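
Unlike the TD-error examples, this step accepts weights but never applies them. If per-sample weighting were wanted here as well, one possible variant (purely illustrative, not taken from the source) is to weight the cross-entropy before reducing it:

import tensorflow as tf

# Illustrative only: weight the per-sample cross-entropy the same way the
# other examples weight their squared TD error, then reduce to a scalar loss.
target_prob = tf.constant([[1.0], [0.0]])   # dummy targets
q_pred = tf.constant([[0.8], [0.3]])        # dummy predictions
weights = tf.constant([0.5, 1.0])           # dummy per-sample weights
crossent_loss = tf.losses.binary_crossentropy(target_prob, q_pred)
loss = tf.reduce_mean(crossent_loss * weights)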
Example No. 5
    def train_step(self, obs_n, target, weights):
        """
        Trains the value function estimator for one gradient step, with
        clipped gradients. Internal function, because the concatenation
        cannot be done inside tf.function.
        """
        with tf.GradientTape() as tape:
            # Concatenate the per-agent observations (a single observation
            # needs no concatenation) and run the forward pass.
            x = self.input_concat_layer(obs_n) if len(obs_n) > 1 else obs_n[0]
            for idx in range(self.num_layers):
                x = self.hidden_layers[idx](x)
            v_pred = self.output_layer(x)
            # Per-sample squared TD error, weighted before averaging.
            td_loss = tf.math.square(target - v_pred)
            loss = tf.reduce_mean(td_loss * weights)

        gradients = tape.gradient(loss, self.model.trainable_variables)

        # Clip the gradients, then apply the optimizer update.
        local_clipped = clip_by_local_norm(gradients, self.clip_norm)
        self.optimizer.apply_gradients(
            zip(local_clipped, self.model.trainable_variables))

        # Return the per-sample TD error, e.g. for updating replay priorities.
        return td_loss
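
Every example relies on a project-specific clip_by_local_norm helper whose definition is not shown here. Going only by its name and usage, it presumably clips each gradient tensor by its own norm rather than by the global norm; a minimal sketch under that assumption:

import tensorflow as tf

def clip_by_local_norm(gradients, norm):
    # Clip every gradient tensor independently to the given norm, unlike
    # tf.clip_by_global_norm, which rescales all gradients jointly.
    return [tf.clip_by_norm(g, norm) if g is not None else None
            for g in gradients]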