import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# `wrappers` (course utilities) and `RAdamOptimizer` are assumed to be
# provided by project-local modules.


class Network:
    def __init__(self, env, args):
        # TODO: Analogously to paac, your model should contain two components:
        # - actor, which predicts distribution over the actions
        # - critic, which predicts the value function
        #
        # The given states are tile encoded, so they are integral indices of
        # tiles intersecting the state. Therefore, you should convert them
        # to dense encoding (one-hot-like, with `args.tiles` ones).
        # (Or you can even use embeddings for better efficiency.)
        #
        # The actor computes `mus` and `sds`, each of shape [batch_size, actions].
        # Compute each independently using states as input, adding a fully connected
        # layer with `args.hidden_layer_size` units and ReLU activation. Then:
        # - For `mus`, add a fully connected layer with `actions` outputs.
        #   To avoid `mus` moving out of the required range, you should apply
        #   a properly scaled `tf.tanh` activation.
        # - For `sds`, add a fully connected layer with `actions` outputs
        #   and `tf.nn.softplus` activation.
        #
        # The critic should be a usual one, passing states through one hidden
        # layer with `args.hidden_layer_size` ReLU units and then predicting
        # the value function.

        # Actor: embed the tile indices, average them into a dense state
        # representation, and predict `mu` and `sd` for a single action dimension.
        policy_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(policy_in)
        x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(x)
        # tanh scaled by 2, so `mu` stays within [-2, 2].
        self.mu = tf.keras.layers.Dense(
            1, activation=lambda x: tf.constant(2.0) * tf.tanh(x))(x)
        self.sd = tf.keras.layers.Dense(
            1, activation=tf.keras.activations.softplus)(x)
        policy_out = tf.keras.layers.Concatenate()([self.mu, self.sd])
        self.actor = tf.keras.Model(policy_in, policy_out)
        self.policy_optimizer = RAdamOptimizer(args.learning_rate)

        # Critic: the same embedding-based state encoding followed by one
        # hidden layer and a single value output.
        value_in = tf.keras.Input(shape=(args.tiles,))
        x = tf.keras.layers.Embedding(env.observation_space.nvec[-1],
                                      args.hidden_layer_size,
                                      input_length=args.tiles)(value_in)
        x = tf.keras.layers.GlobalAveragePooling1D(data_format="channels_last")(x)
        x = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(x)
        value_out = tf.keras.layers.Dense(1)(x)
        self.critic = tf.keras.Model(value_in, value_out)
        self.critic.compile(optimizer=RAdamOptimizer(args.learning_rate),
                            loss=tf.keras.losses.MeanSquaredError())

    @wrappers.typed_np_function(np.float32, np.float32, np.float32)
    @tf.function
    def train(self, states, actions, returns):
        # Critic update: mean squared error between returns and predicted values.
        with tf.GradientTape() as critic_tape:
            pred_values = self.critic(states)
            critic_loss = self.critic.loss(returns, pred_values)
        critic_grads = critic_tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grads, self.critic.trainable_variables))

        # Actor update: policy-gradient loss weighted by the advantage.
        with tf.GradientTape() as policy_tape:
            pred_actions = self.actor(states)
            mus = pred_actions[:, 0]
            sds = pred_actions[:, 1]
            # mus = tf.clip_by_value(mus, clip_value_min=-1, clip_value_max=1)
            # sds = tf.clip_by_value(sds, clip_value_min=0, clip_value_max=1)
            action_distribution = tfp.distributions.Normal(mus, sds)
            # The critic estimate serves as a baseline; gradients below are taken
            # only with respect to the actor's variables, so none flow into the critic.
            advantage = returns - pred_values[:, 0]
            nll = -action_distribution.log_prob(actions[:, 0])
            policy_loss = tf.math.reduce_mean(nll * advantage)
            # Optional entropy penalization (currently disabled); `log(sds)` equals
            # the Normal entropy up to an additive constant.
            entropy = tf.math.reduce_mean(tf.math.log(sds))
            # policy_loss -= args.beta * entropy
        policy_grad = policy_tape.gradient(policy_loss, self.actor.trainable_variables)
        self.policy_optimizer.apply_gradients(
            zip(policy_grad, self.actor.trainable_variables))
        # The original template below describes computing a single combined loss;
        # the implementation above instead updates the critic and the actor in two
        # separate steps.
        #
        # TODO: Run the model on given `states` and compute
        # sds, mus and predicted values. Then create `action_distribution` using
        # `tfp.distributions.Normal` class and computed mus and sds.
        # In PyTorch, the corresponding class is `torch.distributions.normal.Normal`.
        #
        # TODO: Compute total loss as a sum of three losses:
        # - negative log likelihood of the `actions` in the `action_distribution`
        #   (using the `log_prob` method). You then need to sum the log probabilities
        #   of actions in a single batch example (using `tf.math.reduce_sum` with `axis=1`).
        #   Finally multiply the resulting vector by (returns - predicted values)
        #   and compute its mean. Note that the gradient must not flow through
        #   the predicted values (you can use `tf.stop_gradient` if necessary).
        # - negative value of the distribution entropy (use `entropy` method of
        #   the `action_distribution`) weighted by `args.entropy_regularization`.
        # - mean square error of the `returns` and predicted values.

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_actions(self, states):
        # TODO: Return predicted action distributions (mus and sds).
        # The transpose yields shape [2, batch_size], so callers can unpack
        # the result as `mus, sds`.
        mus_sds = tf.transpose(self.actor(states), (1, 0))
        # return tf.clip_by_value(mus_sds[0], -1, 1), tf.clip_by_value(mus_sds[1], 0, 1)
        return mus_sds

    @wrappers.typed_np_function(np.float32)
    @tf.function
    def predict_values(self, states):
        # TODO: Return predicted state values.
        return self.critic(states)[:, 0]
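
# The two helpers below are illustrative sketches only, not part of the original
# assignment code.  `multi_hot_tiles`, `example_training_step`, `venv` (a
# vectorized Gym-style environment with the classic 4-tuple `step` API),
# `args.gamma`, and the default action range are all assumptions introduced
# here for illustration.


def multi_hot_tiles(states, num_tiles):
    """Dense 'one-hot-like' encoding mentioned in the TODO above: each state
    becomes a vector with `args.tiles` ones.  An alternative to the Embedding
    plus GlobalAveragePooling1D encoding used by the Network."""
    return tf.reduce_sum(tf.one_hot(tf.cast(states, tf.int32), depth=num_tiles), axis=1)


def example_training_step(network, venv, states, args, action_low=-1.0, action_high=1.0):
    """One PAAC-style update using the Network above (hedged sketch)."""
    mus, sds = network.predict_actions(states)
    # Sample actions from the predicted Normal policy and clip them to the
    # assumed action range; keep them 2D ([batch_size, 1]) for `train`.
    actions = np.random.normal(np.asarray(mus), np.asarray(sds))[:, np.newaxis]
    actions = np.clip(actions, action_low, action_high)
    next_states, rewards, dones, _ = venv.step(actions)
    # One-step bootstrapped returns; finished episodes do not bootstrap.
    next_values = np.asarray(network.predict_values(next_states))
    returns = rewards + args.gamma * next_values * (1 - dones)
    network.train(states, actions, returns)
    return next_states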

# Builds and restores a Transformer-XL graph using TF1-style `tf.compat.v1`
# APIs; it relies on the separate `modules` helper (Transformer-XL building
# blocks) and `RAdamOptimizer`.
def load_model(self):
    # placeholders
    self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.mems_i = [
        tf.compat.v1.placeholder(tf.float32, [self.mem_len, self.batch_size, self.d_model])
        for _ in range(self.n_layer)]

    # model
    self.global_step = tf.compat.v1.train.get_or_create_global_step()
    initializer = tf.compat.v1.keras.initializers.glorot_normal()
    proj_initializer = tf.compat.v1.keras.initializers.glorot_normal()
    with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
        xx = tf.transpose(self.x, [1, 0])
        yy = tf.transpose(self.y, [1, 0])
        loss, self.logits, self.new_mem = modules.transformer(
            dec_inp=xx,
            target=yy,
            mems=self.mems_i,
            n_token=self.n_token,
            n_layer=self.n_layer,
            d_model=self.d_model,
            d_embed=self.d_embed,
            n_head=self.n_head,
            d_head=self.d_head,
            d_inner=self.d_ff,
            dropout=self.dropout,
            dropatt=self.dropout,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=self.is_training,
            mem_len=self.mem_len,
            rezero=self.rezero,
            cutoffs=[],
            div_val=-1,
            tie_projs=[],
            same_length=False,
            clamp_len=-1,
            input_perms=None,
            target_perms=None,
            head_target=None,
            untie_r=False,
            proj_same_dim=True)
    # The average loss must be defined before the gradients are taken from it.
    self.avg_loss = tf.reduce_mean(loss)

    # vars
    variables = tf.compat.v1.trainable_variables()
    grads = tf.gradients(self.avg_loss, variables)
    grads_and_vars = list(zip(grads, variables))

    # optimizer with cosine learning-rate decay
    decay_lr = tf.compat.v1.train.cosine_decay(
        self.learning_rate, global_step=self.global_step,
        decay_steps=400000, alpha=0.004)
    optimizer = RAdamOptimizer(decay_lr)
    optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
    self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)

    # saver
    self.saver = tf.compat.v1.train.Saver()
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    self.sess = tf.compat.v1.Session(config=config)
    self.saver.restore(self.sess, self.checkpoint_path)
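
# Hedged sketch, not in the original code: a minimal forward pass through the
# restored graph, written as a companion method of the same (unshown) class.
# `forward_pass_sketch`, `batch_x` ([batch_size, seq_len] token ids) and
# `batch_mems` (a list of `n_layer` arrays shaped [mem_len, batch_size, d_model])
# are assumptions introduced here for illustration.
def forward_pass_sketch(self, batch_x, batch_mems):
    feed_dict = {self.x: batch_x}
    for mem_placeholder, mem_value in zip(self.mems_i, batch_mems):
        feed_dict[mem_placeholder] = mem_value
    # Only the logits and the updated memories are fetched; the target
    # placeholder `self.y` feeds the loss only, so it should not be required here.
    logits, new_mems = self.sess.run([self.logits, self.new_mem], feed_dict=feed_dict)
    return logits, new_mems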