import tensorflow as tf

# Actor, Critic and MemoryBuffer are assumed to be defined elsewhere in the project.


class Agent:
    def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 hd1=400, hd2=300, batch_size=64, noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.memory = MemoryBuffer(max_size)
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = Actor(n_actions=n_actions)
        self.critic = Critic()
        self.target_actor = Actor(n_actions=n_actions)
        self.target_critic = Critic()

        self.actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))
        self.critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=beta))

        # Hard-copy the online weights into the target networks at start-up.
        self.update_weights(tau=1)

    def remember(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

    def train(self):
        cl, al = self.learn()
        if cl is not None:
            self.update_weights()
        return cl, al

    def update_weights(self, tau=None):
        # Soft update: target = tau * online + (1 - tau) * target.
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(weight * tau + targets[i] * (1 - tau))
        self.target_critic.set_weights(weights)

    def choose_action(self, observation, evaluate=False):
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            # Gaussian exploration noise during training.
            actions += tf.random.normal(shape=[self.n_actions],
                                        mean=0.0, stddev=self.noise)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions[0]

    # @tf.function
    def learn(self):
        if len(self.memory) < self.batch_size:
            return None, None

        states, actions, rewards, next_states, done = \
            self.memory.sample(self.batch_size)

        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        # Critic update: regress Q(s, a) towards the one-step TD target.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_states)
            critic_value_ = tf.squeeze(
                self.target_critic(next_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            target = rewards + self.gamma * critic_value_ * (1 - done)
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        critic_gradient = tape.gradient(critic_loss,
                                        self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_gradient, self.critic.trainable_variables))

        # Actor update: maximise the critic's estimate of the policy's actions.
        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            actor_loss = -self.critic(states, new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_gradient = tape.gradient(actor_loss,
                                       self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_gradient, self.actor.trainable_variables))

        # The soft target-network update is applied by train() after this step.
        return critic_loss, actor_loss
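
# --- Usage sketch (not part of the original class) ---
# A minimal training loop, assuming the classic Gym API and a continuous-control
# environment such as Pendulum-v1; the environment name, episode count and the
# `.numpy()` conversion are illustrative assumptions.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v1')
    agent = Agent(input_dims=env.observation_space.shape, env=env,
                  n_actions=env.action_space.shape[0])

    for episode in range(250):
        observation = env.reset()
        done, score = False, 0.0
        while not done:
            action = agent.choose_action(observation)
            next_observation, reward, done, _ = env.step(action.numpy())
            agent.remember(observation, action, reward, next_observation, done)
            agent.train()                      # learn + soft-update the targets
            score += reward
            observation = next_observation
        print(f'episode {episode} score {score:.1f}')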
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Actor and Critic for this agent are assumed to be defined elsewhere in the project.


class PPOAgent:
    def __init__(self, rows, columns, num_actions, l_rate=1e-4, gamma=0.99,
                 lam=0.95, policy_kl_range=0.0008, policy_params=20,
                 value_clip=1.0, loss_coefficient=1.0, entropy_coefficient=0.05):
        self.rows = rows
        self.columns = columns
        self.num_actions = num_actions

        self.actor = Actor(self.num_actions)
        self.critic = Critic()
        self.actor_old = Actor(self.num_actions)
        self.critic_old = Critic()
        self.optimizer = tf.keras.optimizers.Adam(l_rate)

        self.gamma = gamma
        self.lam = lam
        self.policy_kl_range = policy_kl_range
        self.policy_params = policy_params
        self.value_clip = value_clip
        self.loss_coefficient = loss_coefficient
        self.entropy_coefficient = entropy_coefficient

    @tf.function
    def fit(self, states, actions, rewards, next_states, dones):
        with tf.GradientTape() as tape:
            action_probabilities, values = self.actor(states), self.critic(states)
            old_action_probabilities, old_values = (self.actor_old(states),
                                                    self.critic_old(states))
            next_values = self.critic(next_states)

            loss = self._get_loss(action_probabilities, values,
                                  old_action_probabilities, old_values,
                                  next_values, actions, rewards, dones)

        grads = tape.gradient(
            loss, self.actor.trainable_variables + self.critic.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads,
                self.actor.trainable_variables + self.critic.trainable_variables))

    def select_action(self, state, training=False):
        state_in = tf.expand_dims(state, axis=0)
        probabilities = self.actor(state_in)
        if training:
            # Sample from the policy distribution while training...
            distribution = tfp.distributions.Categorical(probs=probabilities)
            action = distribution.sample()
            action = int(action[0])
        else:
            # ...and act greedily at evaluation time.
            action = tf.argmax(probabilities[0]).numpy()
        return action

    def get_model(self):
        return self.actor

    def update_networks(self):
        # Copy the current policy/value weights into the "old" networks.
        self.actor_old.set_weights(self.actor.get_weights())
        self.critic_old.set_weights(self.critic.get_weights())

    def save_model_weights(self, actor_filename, critic_filename):
        self.actor.save_weights(actor_filename)
        self.critic.save_weights(critic_filename)

    def load_model_weights(self, actor_filename, critic_filename=None):
        # Build the networks with a dummy forward pass before loading weights.
        self.actor(np.zeros((1, self.rows, self.columns, 1)))
        self.actor.load_weights(actor_filename)
        self.actor_old(np.zeros((1, self.rows, self.columns, 1)))
        self.actor_old.load_weights(actor_filename)
        if critic_filename is not None:
            self.critic(np.zeros((1, self.rows, self.columns, 1)))
            self.critic.load_weights(critic_filename)
            self.critic_old(np.zeros((1, self.rows, self.columns, 1)))
            self.critic_old.load_weights(critic_filename)

    def save_optimizer_weights(self, filename):
        np.save(filename, self.optimizer.get_weights())

    def load_optimizer_weights(self, filename):
        optimizer_weights = np.load(filename, allow_pickle=True)
        # Apply zero gradients once so the optimizer creates its slot variables,
        # then restore the saved state.
        model_weights = (self.actor.trainable_variables +
                         self.critic.trainable_variables)
        zero_grads = [tf.zeros_like(w) for w in model_weights]
        self.optimizer.apply_gradients(zip(zero_grads, model_weights))
        self.optimizer.set_weights(optimizer_weights)

    def _get_loss(self, action_probabilities, values, old_action_probabilities,
                  old_values, next_values, actions, rewards, dones):
        old_values = tf.stop_gradient(old_values)

        advantages = self._generalized_advantages_estimation(
            values, rewards, next_values, dones)
        returns = tf.stop_gradient(advantages + values)
        advantages = tf.stop_gradient(
            (advantages - tf.math.reduce_mean(advantages)) /
            (tf.math.reduce_std(advantages) + 1e-7))

        log_probabilities = self._log_probabilities(action_probabilities, actions)
        old_log_probabilities = tf.stop_gradient(
            self._log_probabilities(old_action_probabilities, actions))
        ratios = tf.math.exp(log_probabilities - old_log_probabilities)

        # Policy objective: penalise the KL divergence only when it exceeds the
        # trust-region range and the ratio has moved past 1.
        kl_divergence = self._kl_divergence(old_action_probabilities,
                                            action_probabilities)
        policy_gradient_loss = tf.where(
            tf.logical_and(kl_divergence >= self.policy_kl_range, ratios > 1),
            ratios * advantages - self.policy_params * kl_divergence,
            ratios * advantages)
        policy_gradient_loss = tf.math.reduce_mean(policy_gradient_loss)

        entropy = tf.math.reduce_mean(self._entropy(action_probabilities))

        # Clipped value loss.
        clipped_values = old_values + tf.clip_by_value(
            values - old_values, -self.value_clip, self.value_clip)
        values_losses = tf.math.square(returns - values) * 0.5
        clipped_values_losses = tf.math.square(returns - clipped_values) * 0.5
        critic_loss = tf.math.reduce_mean(
            tf.math.maximum(values_losses, clipped_values_losses))

        loss = (critic_loss * self.loss_coefficient) - policy_gradient_loss - (
            entropy * self.entropy_coefficient)
        return loss

    def _generalized_advantages_estimation(self, values, rewards, next_values,
                                           dones):
        gae = 0
        advantages = []
        delta = rewards + (1.0 - dones) * self.gamma * next_values - values
        for i in reversed(range(len(rewards))):
            gae = delta[i] + (1.0 - dones[i]) * self.gamma * self.lam * gae
            advantages.insert(0, gae)
        return tf.stack(advantages)

    def _log_probabilities(self, action_probabilities, actions):
        distribution = tfp.distributions.Categorical(probs=action_probabilities)
        return tf.expand_dims(distribution.log_prob(actions), axis=1)

    def _kl_divergence(self, probabilities1, probabilities2):
        distribution1 = tfp.distributions.Categorical(probs=probabilities1)
        distribution2 = tfp.distributions.Categorical(probs=probabilities2)
        return tf.expand_dims(tfp.distributions.kl_divergence(
            distribution1, distribution2), axis=1)

    def _entropy(self, probabilities):
        distribution = tfp.distributions.Categorical(probs=probabilities)
        return distribution.entropy()
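
# --- Usage sketch (not part of the original class) ---
# A hypothetical collect-then-update cycle. `env` stands in for a board-style
# environment returning observations of shape (rows, columns, 1); its API, the
# board size (6x7, 7 actions) and the trajectory length are assumptions made
# for illustration only.
agent = PPOAgent(rows=6, columns=7, num_actions=7)

states, actions, rewards, next_states, dones = [], [], [], [], []
state = env.reset()
for _ in range(128):                                   # collect one trajectory
    action = agent.select_action(state, training=True)
    next_state, reward, done, _ = env.step(action)
    states.append(state)
    actions.append(action)
    rewards.append([reward])                           # keep shape (batch, 1)
    next_states.append(next_state)
    dones.append([float(done)])
    state = env.reset() if done else next_state

agent.fit(tf.constant(states, dtype=tf.float32),
          tf.constant(actions),
          tf.constant(rewards, dtype=tf.float32),
          tf.constant(next_states, dtype=tf.float32),
          tf.constant(dones, dtype=tf.float32))
agent.update_networks()                                # refresh the "old" networks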