class PPOAgent:

    GAMMA = 0.99
    GAE_LAMBDA = 0.95
    CLIPRANGE = 0.2
    OPT_ITER = 20
    BATCH_SIZE = 2048

    def __init__(self, env_id, action_space, trajectory_size=256,
                 n_envs=1, max_timesteps=1500):

        self.env_id = env_id
        self.n_envs = n_envs
        self.trajectory_size = trajectory_size

        self.vecenv = VecEnv(env_id=self.env_id, n_envs=self.n_envs,
                             max_timesteps=max_timesteps)

        self.policy = PolicyNetwork(action_space=action_space)
        self.old_policy = PolicyNetwork(action_space=action_space)

        self.critic = CriticNetwork()

        self.r_running_stats = util.RunningStats(shape=(action_space, ))

        self._init_network()

    def _init_network(self):
        """Build the policy networks with a dummy forward pass."""
        env = gym.make(self.env_id)
        state = np.atleast_2d(env.reset())
        self.policy(state)
        self.old_policy(state)

    def run(self, n_updates, logdir):

        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        history = {"steps": [], "scores": []}

        states = self.vecenv.reset()
        hiscore = None

        for epoch in range(n_updates):

            #: Collect a fixed-length trajectory from every parallel env
            for _ in range(self.trajectory_size):
                actions = self.policy.sample_action(states)
                next_states = self.vecenv.step(actions)
                states = next_states

            trajectories = self.vecenv.get_trajectories()

            for trajectory in trajectories:
                self.r_running_stats.update(trajectory["r"])

            trajectories = self.compute_advantage(trajectories)

            states, actions, advantages, vtargs = self.create_minibatch(trajectories)

            vloss = self.update_critic(states, vtargs)

            self.update_policy(states, actions, advantages)

            global_steps = (epoch + 1) * self.trajectory_size * self.n_envs
            train_scores = np.array([traj["r"].sum() for traj in trajectories])

            if epoch % 1 == 0:  #: evaluate every epoch
                test_scores, total_steps = self.play(n=1)
                test_scores, total_steps = np.array(test_scores), np.array(total_steps)
                history["steps"].append(global_steps)
                history["scores"].append(test_scores.mean())
                ma_score = sum(history["scores"][-10:]) / 10
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores.mean(), step=epoch)
                    tf.summary.scalar("test_steps", total_steps.mean(), step=epoch)
                print(f"Epoch {epoch}, {global_steps//1000}K, {test_scores.mean()}")

            if epoch // 10 > 10 and (hiscore is None or ma_score > hiscore):
                self.save_model()
                hiscore = ma_score
                print("Model Saved")

            with self.summary_writer.as_default():
                tf.summary.scalar("value_loss", vloss, step=epoch)
                tf.summary.scalar("train_score", train_scores.mean(), step=epoch)

        return history

    def compute_advantage(self, trajectories):
        """Generalized Advantage Estimation (GAE, 2016)."""
        for trajectory in trajectories:

            trajectory["v_pred"] = self.critic(trajectory["s"]).numpy()
            trajectory["v_pred_next"] = self.critic(trajectory["s2"]).numpy()

            is_nonterminals = 1 - trajectory["done"]

            #: Scale rewards by their running standard deviation
            normed_rewards = (trajectory["r"]
                              / (np.sqrt(self.r_running_stats.var) + 1e-4))

            deltas = (normed_rewards
                      + self.GAMMA * is_nonterminals * trajectory["v_pred_next"]
                      - trajectory["v_pred"])

            advantages = np.zeros_like(deltas, dtype=np.float32)

            lastgae = 0
            for i in reversed(range(len(deltas))):
                lastgae = (deltas[i]
                           + self.GAMMA * self.GAE_LAMBDA * is_nonterminals[i] * lastgae)
                advantages[i] = lastgae

            trajectory["advantage"] = advantages
            trajectory["R"] = advantages + trajectory["v_pred"]

        return trajectories

    def update_policy(self, states, actions, advantages):

        self.old_policy.set_weights(self.policy.get_weights())

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_means, old_stdevs = self.old_policy(states[idx])
            old_logprob = self.compute_logprob(old_means, old_stdevs, actions[idx])

            with tf.GradientTape() as tape:
                new_means, new_stdevs = self.policy(states[idx])
                new_logprob = self.compute_logprob(new_means, new_stdevs, actions[idx])

                ratio = tf.exp(new_logprob - old_logprob)
                ratio_clipped = tf.clip_by_value(ratio,
                                                 1 - self.CLIPRANGE,
                                                 1 + self.CLIPRANGE)

                #: Clipped surrogate objective (negated for gradient descent)
                loss_unclipped = ratio * advantages[idx]
                loss_clipped = ratio_clipped * advantages[idx]
                loss = tf.minimum(loss_unclipped, loss_clipped)
                loss = -1 * tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.policy.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.policy.optimizer.apply_gradients(
                zip(grads, self.policy.trainable_variables))

    def update_critic(self, states, v_targs):

        losses = []

        indices = np.random.choice(range(states.shape[0]),
                                   (self.OPT_ITER, self.BATCH_SIZE))

        for i in range(self.OPT_ITER):

            idx = indices[i]

            old_vpred = self.critic(states[idx])

            with tf.GradientTape() as tape:
                vpred = self.critic(states[idx])
                vpred_clipped = old_vpred + tf.clip_by_value(
                    vpred - old_vpred, -self.CLIPRANGE, self.CLIPRANGE)
                loss = tf.maximum(tf.square(v_targs[idx] - vpred),
                                  tf.square(v_targs[idx] - vpred_clipped))
                loss = tf.reduce_mean(loss)

            grads = tape.gradient(loss, self.critic.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 0.5)
            self.critic.optimizer.apply_gradients(
                zip(grads, self.critic.trainable_variables))

            losses.append(loss)

        return np.array(losses).mean()

    @tf.function
    def compute_logprob(self, means, stdevs, actions):
        """Log-density of a diagonal Gaussian:
           logp(x) = -0.5 * log(2π) - log(std) - 0.5 * ((x - mean) / std)^2
        """
        logprob = -0.5 * np.log(2 * np.pi)
        logprob += -tf.math.log(stdevs)
        logprob += -0.5 * tf.square((actions - means) / stdevs)
        logprob = tf.reduce_sum(logprob, axis=1, keepdims=True)
        return logprob

    def create_minibatch(self, trajectories):

        states = np.vstack([traj["s"] for traj in trajectories])
        actions = np.vstack([traj["a"] for traj in trajectories])
        advantages = np.vstack([traj["advantage"] for traj in trajectories])
        v_targs = np.vstack([traj["R"] for traj in trajectories])

        return states, actions, advantages, v_targs

    def save_model(self):
        self.policy.save_weights("checkpoints/policy")
        self.critic.save_weights("checkpoints/critic")

    def load_model(self):
        self.policy.load_weights("checkpoints/policy")
        self.critic.load_weights("checkpoints/critic")

    def play(self, n=1, monitordir=None, verbose=False):

        if monitordir:
            env = wrappers.Monitor(gym.make(self.env_id),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_id)

        total_rewards = []
        total_steps = []

        for _ in range(n):

            state = env.reset()

            done = False
            total_reward = 0
            steps = 0

            while not done:

                steps += 1

                action = self.policy.sample_action(state)

                next_state, reward, done, _ = env.step(action[0])

                if verbose:
                    mean, sd = self.policy(np.atleast_2d(state))
                    print(mean, sd)
                    print(reward)

                total_reward += reward

                if done:
                    break
                else:
                    state = next_state

            total_rewards.append(total_reward)
            total_steps.append(steps)
            print()
            print(total_reward, steps)
            print()

        return total_rewards, total_steps
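#: --- Usage sketch (added for illustration, not part of the original class) ---
#: A minimal sketch of how this agent might be driven, assuming PolicyNetwork,
#: CriticNetwork, VecEnv and util.RunningStats are importable from this
#: repository. The env id, action_space, n_envs and n_updates below are illustrative.
if __name__ == "__main__":
    import pathlib

    logdir = pathlib.Path("log/ppo")          #: TensorBoard summary directory
    agent = PPOAgent(env_id="Pendulum-v0",    #: any continuous-action gym env
                     action_space=1,          #: must match the env's action dim
                     trajectory_size=256,
                     n_envs=4)
    history = agent.run(n_updates=500, logdir=logdir)
    #: history["steps"] / history["scores"] trace the learning curve;
    #: the best-scoring weights are saved under checkpoints/.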
class TD3Agent:

    MAX_EXPERIENCES = 30000
    MIN_EXPERIENCES = 300
    ENV_ID = "Pendulum-v0"
    ACTION_SPACE = 1
    MAX_ACTION = 2
    OBSERVATION_SPACE = 3
    CRITIC_UPDATE_PERIOD = 4
    POLICY_UPDATE_PERIOD = 8
    TAU = 0.02
    GAMMA = 0.99
    BATCH_SIZE = 64
    NOISE_STDDEV = 0.2

    def __init__(self):

        self.env = gym.make(self.ENV_ID)
        self.env.max_episode_steps = 3000

        self.actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                  max_action=self.MAX_ACTION)
        self.target_actor = ActorNetwork(action_space=self.ACTION_SPACE,
                                         max_action=self.MAX_ACTION)

        self.critic = CriticNetwork()
        self.target_critic = CriticNetwork()

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0
        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """Initialize network parameters with a dummy forward pass."""
        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor.call(dummy_state)
        self.target_actor.call(dummy_state)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.call(dummy_state, dummy_action, training=False)
        self.target_critic.set_weights(self.critic.get_weights())

    def play(self, n_episodes):

        total_rewards = []
        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)
            recent_scores.append(total_reward)
            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.NOISE_STDDEV}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self):

        total_reward = 0
        steps = 0
        done = False
        state = self.env.reset()

        while not done:

            action = self.actor.sample_action(state, noise=self.NOISE_STDDEV)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)
            self.buffer.add_experience(exp)

            state = next_state
            total_reward += reward
            steps += 1
            self.global_steps += 1

            #: Delayed Policy update
            if self.global_steps % self.CRITIC_UPDATE_PERIOD == 0:
                if self.global_steps % self.POLICY_UPDATE_PERIOD == 0:
                    self.update_network(self.BATCH_SIZE, update_policy=True)
                    self.update_target_network()
                else:
                    self.update_network(self.BATCH_SIZE)

        return total_reward, steps

    def update_network(self, batch_size, update_policy=False):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards,
         next_states, dones) = self.buffer.get_minibatch(batch_size)

        #: Target policy smoothing: clipped noise on the target action
        clipped_noise = np.clip(np.random.normal(0, 0.2, self.ACTION_SPACE), -0.5, 0.5)
        next_actions = self.target_actor(next_states) + clipped_noise * self.MAX_ACTION

        #: Clipped double-Q: take the smaller of the two target critics
        q1, q2 = self.target_critic(next_states, next_actions)
        next_qvalues = [min(q1, q2) for q1, q2
                        in zip(q1.numpy().flatten(), q2.numpy().flatten())]

        #: Compute target values
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue
            in zip(rewards, dones, next_qvalues)]).astype(np.float32)

        #: Update Critic
        with tf.GradientTape() as tape:
            q1, q2 = self.critic(states, actions)
            loss1 = tf.reduce_mean(tf.square(target_values - q1))
            loss2 = tf.reduce_mean(tf.square(target_values - q2))
            loss = loss1 + loss2

        variables = self.critic.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic.optimizer.apply_gradients(zip(gradients, variables))

        #: Delayed update of ActorNetwork
        if update_policy:
            with tf.GradientTape() as tape:
                q1, _ = self.critic(states, self.actor(states))
                J = -1 * tf.reduce_mean(q1)

            variables = self.actor.trainable_variables
            gradients = tape.gradient(J, variables)
            self.actor.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        #: soft-target update Actor (elementwise Polyak averaging of the weight lists)
        target_actor_weights = self.target_actor.get_weights()
        actor_weights = self.actor.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        self.target_actor.set_weights(
            (1 - self.TAU) * np.array(target_actor_weights)
            + (self.TAU) * np.array(actor_weights))

        #: soft-target update Critic
        target_critic_weights = self.target_critic.get_weights()
        critic_weights = self.critic.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic.set_weights(
            (1 - self.TAU) * np.array(target_critic_weights)
            + (self.TAU) * np.array(critic_weights))

    def save_model(self):
        self.actor.save_weights("checkpoints/actor")
        self.critic.save_weights("checkpoints/critic")

    def load_model(self):
        self.actor.load_weights("checkpoints/actor")
        self.target_actor.load_weights("checkpoints/actor")
        self.critic.load_weights("checkpoints/critic")
        self.target_critic.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0
            steps = 0
            done = False
            state = env.reset()

            while not done:
                action = self.actor.sample_action(state, noise=False)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print("Steps:", steps)
            print()
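#: --- Usage sketch (added for illustration, not part of the original class) ---
#: A minimal sketch for the TD3 agent, assuming ActorNetwork, CriticNetwork,
#: ReplayBuffer and Experience are defined as used above; ENV_ID is fixed to
#: "Pendulum-v0" by the class constants. n_episodes and the monitor directory
#: below are illustrative.
if __name__ == "__main__":
    agent = TD3Agent()
    rewards = agent.play(n_episodes=150)        #: train, logging scores per episode
    agent.test_play(n=3, monitordir="mp4/td3",  #: record evaluation rollouts
                    load_model=True)            #: restore the best checkpoint first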
class DDPGAgent:

    MAX_EXPERIENCES = 30000
    MIN_EXPERIENCES = 300
    ENV_ID = "Pendulum-v0"
    ACTION_SPACE = 1
    OBSERVATION_SPACE = 3
    UPDATE_PERIOD = 4
    START_EPISODES = 20
    TAU = 0.02
    GAMMA = 0.99
    BATCH_SIZE = 32

    def __init__(self):

        self.env = gym.make(self.ENV_ID)
        self.env.max_episode_steps = 1000

        self.actor_network = ActorNetwork(action_space=self.ACTION_SPACE)
        self.target_actor_network = ActorNetwork(action_space=self.ACTION_SPACE)

        self.critic_network = CriticNetwork()
        self.target_critic_network = CriticNetwork()

        self.stdev = 0.2

        self.buffer = ReplayBuffer(max_experiences=self.MAX_EXPERIENCES)

        self.global_steps = 0
        self.hiscore = None

        self._build_networks()

    def _build_networks(self):
        """Initialize network parameters with a dummy forward pass."""
        dummy_state = np.random.normal(0, 0.1, size=self.OBSERVATION_SPACE)
        dummy_state = (dummy_state[np.newaxis, ...]).astype(np.float32)

        dummy_action = np.random.normal(0, 0.1, size=self.ACTION_SPACE)
        dummy_action = (dummy_action[np.newaxis, ...]).astype(np.float32)

        self.actor_network.call(dummy_state)
        self.target_actor_network.call(dummy_state)
        self.target_actor_network.set_weights(self.actor_network.get_weights())

        self.critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.call(dummy_state, dummy_action, training=False)
        self.target_critic_network.set_weights(self.critic_network.get_weights())

    def play(self, n_episodes):

        total_rewards = []
        recent_scores = collections.deque(maxlen=10)

        for n in range(n_episodes):

            if n <= self.START_EPISODES:
                total_reward, localsteps = self.play_episode(random=True)
            else:
                total_reward, localsteps = self.play_episode()

            total_rewards.append(total_reward)
            recent_scores.append(total_reward)
            recent_average_score = sum(recent_scores) / len(recent_scores)

            print(f"Episode {n}: {total_reward}")
            print(f"Local steps {localsteps}")
            print(f"Experiences {len(self.buffer)}")
            print(f"Global step {self.global_steps}")
            print(f"Noise stdev {self.stdev}")
            print(f"recent average score {recent_average_score}")
            print()

            if (self.hiscore is None) or (recent_average_score > self.hiscore):
                self.hiscore = recent_average_score
                print(f"HISCORE Updated: {self.hiscore}")
                self.save_model()

        return total_rewards

    def play_episode(self, random=False):

        total_reward = 0
        steps = 0
        done = False
        state = self.env.reset()

        while not done:

            if random:
                action = np.random.uniform(-2, 2, size=self.ACTION_SPACE)
            else:
                action = self.actor_network.sample_action(state, noise=self.stdev)

            next_state, reward, done, _ = self.env.step(action)

            exp = Experience(state, action, reward, next_state, done)
            self.buffer.add_experience(exp)

            state = next_state
            total_reward += reward
            steps += 1
            self.global_steps += 1

            if self.global_steps % self.UPDATE_PERIOD == 0:
                self.update_network(self.BATCH_SIZE)
                self.update_target_network()

        return total_reward, steps

    def update_network(self, batch_size):

        if len(self.buffer) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards,
         next_states, dones) = self.buffer.get_minibatch(batch_size)

        next_actions = self.target_actor_network(next_states)
        next_qvalues = self.target_critic_network(
            next_states, next_actions).numpy().flatten()

        #: Compute target values and update CriticNetwork
        target_values = np.vstack([
            reward + self.GAMMA * next_qvalue if not done else reward
            for reward, done, next_qvalue
            in zip(rewards, dones, next_qvalues)]).astype(np.float32)

        with tf.GradientTape() as tape:
            qvalues = self.critic_network(states, actions)
            loss = tf.reduce_mean(tf.square(target_values - qvalues))

        variables = self.critic_network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.critic_network.optimizer.apply_gradients(zip(gradients, variables))

        #: Update ActorNetwork
        with tf.GradientTape() as tape:
            J = -1 * tf.reduce_mean(
                self.critic_network(states, self.actor_network(states)))

        variables = self.actor_network.trainable_variables
        gradients = tape.gradient(J, variables)
        self.actor_network.optimizer.apply_gradients(zip(gradients, variables))

    def update_target_network(self):

        #: soft-target update Actor (elementwise Polyak averaging of the weight lists)
        target_actor_weights = self.target_actor_network.get_weights()
        actor_weights = self.actor_network.get_weights()

        assert len(target_actor_weights) == len(actor_weights)

        self.target_actor_network.set_weights(
            (1 - self.TAU) * np.array(target_actor_weights)
            + (self.TAU) * np.array(actor_weights))

        #: soft-target update Critic
        target_critic_weights = self.target_critic_network.get_weights()
        critic_weights = self.critic_network.get_weights()

        assert len(target_critic_weights) == len(critic_weights)

        self.target_critic_network.set_weights(
            (1 - self.TAU) * np.array(target_critic_weights)
            + (self.TAU) * np.array(critic_weights))

    def save_model(self):
        self.actor_network.save_weights("checkpoints/actor")
        self.critic_network.save_weights("checkpoints/critic")

    def load_model(self):
        self.actor_network.load_weights("checkpoints/actor")
        self.target_actor_network.load_weights("checkpoints/actor")
        self.critic_network.load_weights("checkpoints/critic")
        self.target_critic_network.load_weights("checkpoints/critic")

    def test_play(self, n, monitordir, load_model=False):

        if load_model:
            self.load_model()

        if monitordir:
            env = wrappers.Monitor(gym.make(self.ENV_ID),
                                   monitordir, force=True,
                                   video_callable=(lambda ep: ep % 1 == 0))
        else:
            env = gym.make(self.ENV_ID)

        for i in range(n):

            total_reward = 0
            steps = 0
            done = False
            state = env.reset()

            while not done:
                action = self.actor_network.sample_action(state, noise=False)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                total_reward += reward
                steps += 1

            print()
            print(f"Test Play {i}: {total_reward}")
            print("Steps:", steps)
            print()
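#: --- Usage sketch (added for illustration, not part of the original class) ---
#: Same driving pattern as the TD3 agent; the first START_EPISODES episodes are
#: filled with uniform random actions before the learned policy takes over.
#: n_episodes and the monitor directory below are illustrative.
if __name__ == "__main__":
    agent = DDPGAgent()
    rewards = agent.play(n_episodes=150)
    agent.test_play(n=3, monitordir="mp4/ddpg", load_model=True)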