import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# The Actor and Critic network classes are defined elsewhere in the
# accompanying code; PPOAgent only assumes that Actor(num_actions) outputs
# action probabilities and that Critic() outputs a state-value estimate.


class PPOAgent:

    def __init__(self, rows, columns, num_actions, l_rate=1e-4, gamma=0.99,
                 lam=0.95, policy_kl_range=0.0008, policy_params=20,
                 value_clip=1.0, loss_coefficient=1.0,
                 entropy_coefficient=0.05):
        self.rows = rows
        self.columns = columns
        self.num_actions = num_actions
        self.actor = Actor(self.num_actions)
        self.critic = Critic()
        self.actor_old = Actor(self.num_actions)
        self.critic_old = Critic()
        self.optimizer = tf.keras.optimizers.Adam(l_rate)
        self.gamma = gamma
        self.lam = lam
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.loss_coefficient = loss_coefficient
        self.entropy_coefficient = entropy_coefficient

    @tf.function
    def fit(self, states, actions, rewards, next_states, dones):
        """Run one gradient step of the combined actor-critic loss on a batch."""
        with tf.GradientTape() as tape:
            action_probabilities, values = self.actor(states), self.critic(states)
            old_action_probabilities, old_values = (self.actor_old(states),
                                                    self.critic_old(states))
            next_values = self.critic(next_states)
            loss = self._get_loss(action_probabilities, values,
                                  old_action_probabilities, old_values,
                                  next_values, actions, rewards, dones)
        grads = tape.gradient(
            loss,
            self.actor.trainable_variables + self.critic.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads,
                self.actor.trainable_variables + self.critic.trainable_variables))

    def select_action(self, state, training=False):
        """Sample an action during training; act greedily during evaluation."""
        state_in = tf.expand_dims(state, axis=0)
        probabilities = self.actor(state_in)
        if training:
            distribution = tfp.distributions.Categorical(probs=probabilities)
            action = distribution.sample()
            action = int(action[0])
        else:
            action = tf.argmax(probabilities[0]).numpy()
        return action

    def get_model(self):
        return self.actor

    def update_networks(self):
        """Copy the current actor/critic weights into the "old" networks."""
        self.actor_old.set_weights(self.actor.get_weights())
        self.critic_old.set_weights(self.critic.get_weights())

    def save_model_weights(self, actor_filename, critic_filename):
        self.actor.save_weights(actor_filename)
        self.critic.save_weights(critic_filename)

    def load_model_weights(self, actor_filename, critic_filename=None):
        # Call each network on a dummy board once so its variables are built
        # before the stored weights are loaded.
        self.actor(np.zeros((1, self.rows, self.columns, 1)))
        self.actor.load_weights(actor_filename)
        self.actor_old(np.zeros((1, self.rows, self.columns, 1)))
        self.actor_old.load_weights(actor_filename)
        if critic_filename is not None:
            self.critic(np.zeros((1, self.rows, self.columns, 1)))
            self.critic.load_weights(critic_filename)
            self.critic_old(np.zeros((1, self.rows, self.columns, 1)))
            self.critic_old.load_weights(critic_filename)

    def save_optimizer_weights(self, filename):
        np.save(filename, self.optimizer.get_weights())

    def load_optimizer_weights(self, filename):
        optimizer_weights = np.load(filename, allow_pickle=True)
        model_weights = (self.actor.trainable_variables +
                         self.critic.trainable_variables)
        # Apply a dummy all-zero gradient step so the optimizer creates its
        # slot variables before the stored state is restored.
        zero_grads = [tf.zeros_like(w) for w in model_weights]
        self.optimizer.apply_gradients(zip(zero_grads, model_weights))
        self.optimizer.set_weights(optimizer_weights)

    def _get_loss(self, action_probabilities, values, old_action_probabilities,
                  old_values, next_values, actions, rewards, dones):
        old_values = tf.stop_gradient(old_values)
        # Generalized advantage estimation: normalized advantages feed the
        # policy term, unnormalized returns feed the value term.
        advantages = self._generalized_advantages_estimation(
            values, rewards, next_values, dones)
        returns = tf.stop_gradient(advantages + values)
        advantages = tf.stop_gradient(
            (advantages - tf.math.reduce_mean(advantages)) /
            (tf.math.reduce_std(advantages) + 1e-7))
        log_probabilities = self._log_probabilities(action_probabilities,
                                                    actions)
        old_log_probabilities = tf.stop_gradient(
            self._log_probabilities(old_action_probabilities, actions))
        ratios = tf.math.exp(log_probabilities - old_log_probabilities)
        kl_divergence = self._kl_divergence(old_action_probabilities,
                                            action_probabilities)
        # KL-regularized surrogate: once the KL from the old policy exceeds
        # policy_kl_range and the ratio has moved past 1, a KL penalty weighted
        # by policy_params is subtracted from the ratio-weighted advantage.
        policy_gradient_loss = tf.where(
            tf.logical_and(kl_divergence >= self.policy_kl_range, ratios > 1),
            ratios * advantages - self.policy_params * kl_divergence,
            ratios * advantages)
        policy_gradient_loss = tf.math.reduce_mean(policy_gradient_loss)
        entropy = tf.math.reduce_mean(self._entropy(action_probabilities))
        # Clipped value loss, analogous to PPO's clipped policy objective.
        clipped_values = old_values + tf.clip_by_value(
            values - old_values, -self.value_clip, self.value_clip)
        values_losses = tf.math.square(returns - values) * 0.5
        clipped_values_losses = tf.math.square(returns - clipped_values) * 0.5
        critic_loss = tf.math.reduce_mean(
            tf.math.maximum(values_losses, clipped_values_losses))
        loss = (critic_loss * self.loss_coefficient) - policy_gradient_loss - (
            entropy * self.entropy_coefficient)
        return loss

    def _generalized_advantages_estimation(self, values, rewards, next_values,
                                           dones):
        gae = 0
        advantages = []
        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for i in reversed(range(len(rewards))):
            gae = delta[i] + (1.0 - dones[i]) * self.gamma * self.lam * gae
            advantages.insert(0, gae)
        return tf.stack(advantages)

    def _log_probabilities(self, action_probabilities, actions):
        distribution = tfp.distributions.Categorical(
            probs=action_probabilities)
        return tf.expand_dims(distribution.log_prob(actions), axis=1)

    def _kl_divergence(self, probabilities1, probabilities2):
        distribution1 = tfp.distributions.Categorical(probs=probabilities1)
        distribution2 = tfp.distributions.Categorical(probs=probabilities2)
        return tf.expand_dims(
            tfp.distributions.kl_divergence(distribution1, distribution2),
            axis=1)

    def _entropy(self, probabilities):
        distribution = tfp.distributions.Categorical(probs=probabilities)
        return distribution.entropy()
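
# Usage sketch (not part of the original listing): a minimal smoke test of the
# select_action / fit / update_networks cycle on randomly generated
# transitions. The 6x7 board shape, batch size, and random data are
# illustrative assumptions; real training would collect these transitions from
# the environment instead.
rows, columns, num_actions = 6, 7, 7
agent = PPOAgent(rows, columns, num_actions)

states = np.random.rand(8, rows, columns, 1).astype(np.float32)
next_states = np.random.rand(8, rows, columns, 1).astype(np.float32)
actions = np.array([agent.select_action(s, training=True) for s in states])
rewards = np.random.rand(8, 1).astype(np.float32)
dones = np.zeros((8, 1), dtype=np.float32)

# One optimization step on the batch, then refresh the "old" networks that
# anchor the KL term in the policy loss.
agent.fit(states, actions, rewards, next_states, dones)
agent.update_networks()
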
from tqdm import tqdm

# The DDPG-specific Actor, Critic, MemoryBuffer, and OrnsteinUhlenbeckProcess
# classes, together with the gather_stats and tf_summary utilities, are
# defined elsewhere in the accompanying code.


class DDPG:
    """Deep Deterministic Policy Gradient (DDPG) helper class."""

    def __init__(self, act_dim, env_dim, act_range, buffer_size=20000,
                 gamma=0.99, lr=0.00005, tau=0.001):
        """Initialization"""
        # Environment and DDPG hyperparameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        self.lr = lr
        # Create actor and critic networks (the actor uses a smaller learning rate)
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)

    def policy_action(self, s):
        """Use the actor to predict the action for a given state."""
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """Use the Bellman equation to compute the critic target."""
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state, action, reward, done, new_state):
        """Store an experience tuple in the memory buffer."""
        self.buffer.memorize(state, action, reward, done, new_state)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """Update actor and critic networks from sampled experience."""
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-value gradients under the current policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions,
                         np.array(grads).reshape((-1, self.act_dim)))
        # Transfer weights to target networks at rate tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def train(self, env, summary_writer, nb_episodes=12, batch_size=64,
              render=False, gather_train_stats=False):
        results = []

        # First, gather experience
        tqdm_e = tqdm(range(nb_episodes), desc='Score', leave=True,
                      unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while not done:
                if render:
                    env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Add exploration noise and clip the action to the valid range
                a = np.clip(a + noise.generate(time),
                            -self.act_range, self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = \
                    self.sample_batch(batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states, self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Gather stats every episode for plotting
            if gather_train_stats:
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for TensorBoard
            score = tf_summary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results

    def save_weights(self, path):
        path += '_LR_{}'.format(self.lr)
        self.actor.save(path)
        self.critic.save(path)

    def load_weights(self, path_actor, path_critic):
        self.critic.load_weights(path_critic)
        self.actor.load_weights(path_actor)
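
# Usage sketch (not part of the original listing): wiring the DDPG helper to a
# continuous-control task. The Pendulum environment, episode count, and log
# directory are illustrative assumptions. It presumes TensorFlow 1.x and an
# older Gym release, since train() relies on the TF1-style add_summary
# interface and on env.step() returning four values.
import gym
import tensorflow as tf

env = gym.make('Pendulum-v0')
env_dim = env.observation_space.shape      # state shape expected by Actor/Critic
act_dim = env.action_space.shape[0]        # number of continuous action dimensions
act_range = env.action_space.high[0]       # symmetric action bound used for clipping

agent = DDPG(act_dim, env_dim, act_range)
summary_writer = tf.summary.FileWriter('./logs/ddpg')
results = agent.train(env, summary_writer, nb_episodes=100)
agent.save_weights('./models/ddpg')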