Example #1
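The helper below builds the action-value (Q) head of a DQN model: an optional stack of hidden layers (plain dense layers, dense plus layer normalization, or noisy layers for exploration), followed by an output layer that emits one score per action, or one score per (action, atom) pair when distributional Q-learning is enabled.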
        def build_action_value(prefix: str,
                               model_out: TensorType) -> List[TensorType]:
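            # NOTE: q_hiddens, use_noisy, add_layer_norm, sigma0, num_atoms,
            # v_min and v_max are free variables captured from the enclosing
            # scope; they are not defined in this helper itself.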
            if q_hiddens:
                action_out = model_out
                for i in range(len(q_hiddens)):
                    if use_noisy:
                        action_out = NoisyLayer(
                            "{}hidden_{}".format(prefix, i), q_hiddens[i],
                            sigma0)(action_out)
                    elif add_layer_norm:
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu)(action_out)
                        action_out = \
                            tf.keras.layers.LayerNormalization()(
                                action_out)
                    else:
                        action_out = tf.keras.layers.Dense(
                            units=q_hiddens[i],
                            activation=tf.nn.relu,
                            name="hidden_%d" % i)(action_out)
            else:
                # Avoid postprocessing the outputs. This enables custom models
                # to be used for parametric action DQN.
                action_out = model_out

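            # Output layer: one score per action, or one per (action, atom)
            # pair when num_atoms > 1 (distributional Q-learning).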
            if use_noisy:
                action_scores = NoisyLayer("{}output".format(prefix),
                                           self.action_space.n * num_atoms,
                                           sigma0,
                                           activation=None)(action_out)
            elif q_hiddens:
                action_scores = tf.keras.layers.Dense(
                    units=self.action_space.n * num_atoms,
                    activation=None)(action_out)
            else:
                action_scores = model_out

            if num_atoms > 1:
                # Distributional Q-learning uses a discrete support z
                # to represent the action value distribution
                z = tf.range(num_atoms, dtype=tf.float32)
                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
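                # z now holds num_atoms evenly spaced atoms spanning
                # [v_min, v_max].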

                def _layer(x):
                    support_logits_per_action = tf.reshape(
                        tensor=x, shape=(-1, self.action_space.n, num_atoms))
                    support_prob_per_action = tf.nn.softmax(
                        logits=support_logits_per_action)
                    x = tf.reduce_sum(input_tensor=z * support_prob_per_action,
                                      axis=-1)
                    logits = support_logits_per_action
                    dist = support_prob_per_action
                    return [x, z, support_logits_per_action, logits, dist]

                return tf.keras.layers.Lambda(_layer)(action_scores)
            else:
                logits = tf.expand_dims(tf.ones_like(action_scores), -1)
                dist = tf.expand_dims(tf.ones_like(action_scores), -1)
                return [action_scores, logits, dist]
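To make the distributional branch concrete, here is a minimal, self-contained NumPy sketch (toy shapes and random inputs, not part of the model above) of what the _layer Lambda computes: the flat action_scores are reshaped to one row of logits per action, softmaxed over the atom dimension, and reduced to a scalar Q-value per action as the expectation over the support z.

    import numpy as np

    n_actions, num_atoms = 3, 5          # toy sizes
    v_min, v_max = -10.0, 10.0

    # Evenly spaced support, matching the z computed above.
    z = v_min + np.arange(num_atoms, dtype=np.float32) * \
        (v_max - v_min) / (num_atoms - 1)

    # Stand-in for action_scores: (batch, n_actions * num_atoms) logits.
    action_scores = np.random.randn(2, n_actions * num_atoms).astype(np.float32)
    support_logits = action_scores.reshape(-1, n_actions, num_atoms)

    # Softmax over the atom axis -> a probability mass per action.
    e = np.exp(support_logits - support_logits.max(axis=-1, keepdims=True))
    support_prob = e / e.sum(axis=-1, keepdims=True)

    # Expected value under that distribution -> scalar Q-value per action.
    q_values = (z * support_prob).sum(axis=-1)
    print(q_values.shape)  # (2, 3)

Reducing the distribution to its expectation is what lets this head drop into code paths that expect plain per-action Q-values.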
Example #2
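The companion helper below builds the state-value branch of a dueling DQN head: the same hidden stack as above, followed by an output layer that emits one state score per sample (or num_atoms scores when distributional Q-learning is enabled).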
 def build_state_score(prefix: str,
                       model_out: TensorType) -> TensorType:
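     # NOTE: as in Example #1, q_hiddens, use_noisy, add_layer_norm, sigma0
     # and num_atoms are free variables captured from the enclosing scope.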
     state_out = model_out
     for i in range(len(q_hiddens)):
         if use_noisy:
             state_out = NoisyLayer(
                 "{}dueling_hidden_{}".format(prefix, i), q_hiddens[i],
                 sigma0)(state_out)
         else:
             state_out = tf.keras.layers.Dense(
                 units=q_hiddens[i], activation=tf.nn.relu)(state_out)
             if add_layer_norm:
                 state_out = tf.keras.layers.LayerNormalization()(
                     state_out)
     if use_noisy:
         state_score = NoisyLayer(
             "{}dueling_output".format(prefix),
             num_atoms,
             sigma0,
             activation=None,
         )(state_out)
     else:
         state_score = tf.keras.layers.Dense(units=num_atoms,
                                             activation=None)(state_out)
     return state_score
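For context, here is a minimal sketch of how a dueling architecture typically combines the two heads, assuming num_atoms == 1 so build_state_score returns a single state value per sample; the variable names are illustrative and not taken from the surrounding library code. The state score is added to the mean-centered action scores, so the value branch captures the overall level of Q while the action branch captures only relative advantages.

    import numpy as np

    batch, n_actions = 2, 4
    action_scores = np.random.randn(batch, n_actions).astype(np.float32)  # advantage head
    state_score = np.random.randn(batch, 1).astype(np.float32)            # value head

    # Center the advantages so they have zero mean per sample, then add V.
    advantages = action_scores - action_scores.mean(axis=1, keepdims=True)
    q_values = state_score + advantages
    print(q_values.shape)  # (2, 4)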