Example #1

# Assumed module-level imports for this excerpt (the original file also relies on
# a project-local `wrappers` helper module and an `RAdamOptimizer` implementation
# whose import paths are not shown here):
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
class Network:
    def __init__(self, env, args):
        # TODO: Analogously to paac, your model should contain two components:
        # - actor, which predicts distribution over the actions
        # - critic, which predicts the value function
        #
        # The given states are tile encoded, so they are integral indices of
        # tiles intersecting the state. Therefore, you should convert them
        # to a dense encoding (one-hot-like, with `args.tiles` ones).
        # (Or you can even use embeddings for better efficiency, as done below;
        # a one-hot sketch follows this example.)
        #
        # The actor computes `mus` and `sds`, each of shape [batch_size, actions].
        # Compute each independently using states as input, adding a fully connected
        # layer with `args.hidden_layer_size` units and ReLU activation. Then:
        # - For `mus`, add a fully connected layer with `actions` outputs.
        #   To keep `mus` within the required range, you should apply a
        #   properly scaled `tf.tanh` activation.
        # - For `sds`, add a fully connected layer with `actions` outputs
        #   and `tf.nn.softplus` activation.
        #
        # The critic should be a usual one, passing states through one hidden
        # layer with `args.hidden_layer_size` ReLU units and then predicting
        # the value function.
        policy_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(policy_in)
        x = tf.keras.layers.GlobalAveragePooling1D(
            data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)

        self.mu = tf.keras.layers.Dense(
            1, activation=lambda x: tf.constant(2.0) * tf.tanh(x))(x)
        self.sd = tf.keras.layers.Dense(
            1, activation=tf.keras.activations.softplus)(x)
        policy_out = tf.keras.layers.Concatenate()([self.mu, self.sd])

        self.actor = tf.keras.Model(policy_in, policy_out)
        self.policy_optimizer = RAdamOptimizer(args.learning_rate)

        value_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(value_in)
        x = tf.keras.layers.GlobalAveragePooling1D(
            data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation='relu')(x)
        value_out = tf.keras.layers.Dense(1)(x)
        self.critic = tf.keras.Model(value_in, value_out)
        self.critic.compile(optimizer=RAdamOptimizer(args.learning_rate),
                            loss=tf.keras.losses.MeanSquaredError())

    @wrappers.typed_np_function(np.float32, np.float32, np.float32)
    @tf.function
    def train(self, states, actions, returns):

        # Critic update: fit predicted values to the observed returns.
        with tf.GradientTape() as critic_tape:
            pred_values = self.critic(states)
            critic_loss = self.critic.loss(returns, pred_values)

        critic_grads = critic_tape.gradient(critic_loss,
                                            self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grads, self.critic.trainable_variables))

        # Actor update: policy-gradient step weighted by the advantage.
        with tf.GradientTape() as policy_tape:
            pred_actions = self.actor(states)
            mus = pred_actions[:, 0]
            sds = pred_actions[:, 1]

            # mus = tf.clip_by_value(mus, clip_value_min=-1, clip_value_max=1)
            # sds = tf.clip_by_value(sds, clip_value_min=0, clip_value_max=1)
            action_distribution = tfp.distributions.Normal(mus, sds)

            advantage = returns - pred_values[:, 0]
            nll = -action_distribution.log_prob(actions[:, 0])
            loss = nll * advantage
            policy_loss = tf.math.reduce_mean(loss)

            # entropy penalization
            entropy = tf.math.reduce_mean(tf.math.log(sds))
            # policy_loss -= args.beta * entropy

        policy_grad = policy_tape.gradient(policy_loss,
                                           self.actor.trainable_variables)
        self.policy_optimizer.apply_gradients(
            zip(policy_grad, self.actor.trainable_variables))

        # TODO: Run the model on given `states` and compute
        # sds, mus and predicted values. Then create `action_distribution` using
        # `tfp.distributions.Normal` class and computed mus and sds.
        # In PyTorch, the corresponding class is `torch.distributions.normal.Normal`.
        #
        # TODO: Compute total loss as a sum of three losses:
        # - negative log likelihood of the `actions` in the `action_distribution`
        #   (using the `log_prob` method). You then need to sum the log probabilities
        #   of actions in a single batch example (using `tf.math.reduce_sum` with `axis=1`).
        #   Finally multiply the resulting vector by (returns - predicted values)
        #   and compute its mean. Note that the gradient must not flow through
        #   the predicted values (you can use `tf.stop_gradient` if necessary).
        # - negative value of the distribution entropy (use `entropy` method of
        #   the `action_distribution`) weighted by `args.entropy_regularization`.
        # - mean squared error of the `returns` and predicted values.
        #   (A sketch of this joint-loss formulation follows this example.)

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_actions(self, states):
        # TODO: Return predicted action distributions (mus and sds).
        mus_sds = tf.transpose(self.actor(states), (1, 0))
        # return tf.clip_by_value(mus_sds[0], -1, 1), tf.clip_by_value(mus_sds[1], 0, 1)
        return mus_sds

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_values(self, states):
        # TODO: Return predicted state values.
        return self.critic(states)[:, 0]
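
The comments in Example #1 note that the tile-encoded states can either be expanded into a dense one-hot-like vector with `args.tiles` ones or passed through embeddings (the route taken above). Below is a minimal sketch of the one-hot alternative, assuming the same quantities used above (`args.tiles` indices per state, `env.observation_space.nvec[-1]` total tiles); the helper name `one_hot_state_encoder` is invented for illustration.

import tensorflow as tf

def one_hot_state_encoder(num_tiles, num_total_tiles, hidden_layer_size):
    # The `num_tiles` integer indices of each state are scattered into a binary
    # vector of length `num_total_tiles` (containing `num_tiles` ones) and then
    # passed through a single ReLU hidden layer.
    states = tf.keras.Input(shape=(num_tiles,), dtype=tf.int32)
    dense = tf.keras.layers.Lambda(
        lambda indices: tf.reduce_sum(
            tf.one_hot(indices, num_total_tiles), axis=1))(states)
    hidden = tf.keras.layers.Dense(hidden_layer_size, activation="relu")(dense)
    return tf.keras.Model(states, hidden)

# Hypothetical usage, mirroring the constructor above:
#   encoder = one_hot_state_encoder(args.tiles, env.observation_space.nvec[-1],
#                                   args.hidden_layer_size)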
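
The trailing TODO block in `train` describes a single combined loss (the action negative log likelihood weighted by the advantage with gradients stopped through the predicted values, an entropy term weighted by `args.entropy_regularization`, and the critic's mean squared error) instead of the two separate optimizer steps used above. The following is a rough sketch of that formulation, not part of the original solution; the helper name `joint_loss` and its argument shapes are assumptions.

import tensorflow as tf
import tensorflow_probability as tfp

def joint_loss(mus, sds, values, actions, returns, entropy_regularization):
    # mus, sds, actions: [batch_size, action_dims]; values, returns: [batch_size].
    action_distribution = tfp.distributions.Normal(mus, sds)
    # Sum the log probabilities of the action components of each batch example.
    log_prob = tf.math.reduce_sum(action_distribution.log_prob(actions), axis=1)
    # The advantage must not propagate gradients into the predicted values.
    advantage = tf.stop_gradient(returns - values)
    nll_loss = tf.math.reduce_mean(-log_prob * advantage)
    entropy_loss = -entropy_regularization * tf.math.reduce_mean(
        action_distribution.entropy())
    value_loss = tf.keras.losses.MeanSquaredError()(returns, values)
    return nll_loss + entropy_loss + value_loss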
Example #2

# Assumed module-level imports for this excerpt: `tensorflow as tf`, a
# project-local `modules` package providing `transformer`, and an external
# `RAdamOptimizer` implementation.
    def load_model(self):
        # placeholders: input token ids, target token ids, and per-layer memory
        self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
        self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
        self.mems_i = [tf.compat.v1.placeholder(tf.float32, [self.mem_len, self.batch_size, self.d_model]) for _ in
                       range(self.n_layer)]
        # model (inputs are transposed to time-major below)
        self.global_step = tf.compat.v1.train.get_or_create_global_step()

        initializer = tf.compat.v1.keras.initializers.glorot_normal()
        proj_initializer = tf.compat.v1.keras.initializers.glorot_normal()

        with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
            xx = tf.transpose(self.x, [1, 0])
            yy = tf.transpose(self.y, [1, 0])
            loss, self.logits, self.new_mem = modules.transformer(
                dec_inp=xx,
                target=yy,
                mems=self.mems_i,
                n_token=self.n_token,
                n_layer=self.n_layer,
                d_model=self.d_model,
                d_embed=self.d_embed,
                n_head=self.n_head,
                d_head=self.d_head,
                d_inner=self.d_ff,
                dropout=self.dropout,
                dropatt=self.dropout,
                initializer=initializer,
                proj_initializer=proj_initializer,
                is_training=self.is_training,
                mem_len=self.mem_len,
                rezero=self.rezero,
                cutoffs=[],
                div_val=-1,
                tie_projs=[],
                same_length=False,
                clamp_len=-1,
                input_perms=None,
                target_perms=None,
                head_target=None,
                untie_r=False,
                proj_same_dim=True)
            variables = tf.compat.v1.trainable_variables()

        # average the per-position loss and differentiate it w.r.t. all variables
        self.avg_loss = tf.reduce_mean(loss)
        grads = tf.gradients(self.avg_loss, variables)
        grads_and_vars = list(zip(grads, variables))

        # learning-rate schedule and optimizer
        decay_lr = tf.compat.v1.train.cosine_decay(
            self.learning_rate,
            global_step=self.global_step,
            decay_steps=400000,
            alpha=0.004)

        optimizer = RAdamOptimizer(decay_lr)
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
        self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)

        # saver and session: restore weights from the checkpoint
        self.saver = tf.compat.v1.train.Saver()
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        self.sess = tf.compat.v1.Session(config=config)
        self.saver.restore(self.sess, self.checkpoint_path)
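
Example #2 only builds the graph and restores a checkpoint; a training step would then feed the placeholders it creates. Below is a sketch under the assumption that the instance exposes the attributes referenced above (`batch_size`, `mem_len`, `d_model`, `n_layer`, `sess`, ...) and that `self.new_mem` is a list of per-layer memory tensors; the function name `train_step` is invented for illustration.

import numpy as np

def train_step(model, batch_x, batch_y, prev_mems=None):
    # batch_x, batch_y: int token arrays of shape [batch_size, seq_len];
    # prev_mems: list of `n_layer` arrays shaped [mem_len, batch_size, d_model].
    if prev_mems is None:
        # Start from empty (zero) memory for every layer.
        prev_mems = [np.zeros((model.mem_len, model.batch_size, model.d_model),
                              dtype=np.float32) for _ in range(model.n_layer)]
    feed_dict = {model.x: batch_x, model.y: batch_y}
    for placeholder, mem in zip(model.mems_i, prev_mems):
        feed_dict[placeholder] = mem
    _, loss, new_mems = model.sess.run(
        [model.train_op, model.avg_loss, model.new_mem], feed_dict=feed_dict)
    return loss, new_mems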