def test():
    # Evaluate a trained actor by rolling out the deterministic policy mean
    env = gym.make(args.env_name)
    env.seed(10)
    torch.manual_seed(10)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor = Actor(state_size, action_size, args)
    actor.load_state_dict(torch.load(args.load_model))

    for ep in range(args.test_iter):
        score = 0
        done = False
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            mu, std = actor(torch.Tensor(state))
            # The policy mean is used as a deterministic action for evaluation;
            # actor.get_action(mu, std) would sample a stochastic action instead,
            # and env.action_space.sample() a random one.
            next_state, reward, done, info = env.step(mu.detach().numpy())
            env.render()

            score += reward
            next_state = np.reshape(next_state, [1, state_size])
            state = next_state

        if ep % args.log_interval == 0:
            print(ep, " ep | score ", score)

    env.close()
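# test() and main() below assume an Actor that returns the mean and standard
# deviation of a Gaussian policy and exposes a get_action() helper for sampling.
# The class below is only a minimal illustrative sketch of that interface; the
# hidden size, the state-independent log_std parameter, and the class name are
# assumptions for illustration, not the actual model used in this repository.
import torch
import torch.nn as nn


class GaussianActorSketch(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.mu_head = nn.Linear(hidden_size, action_size)
        # State-independent log standard deviation
        self.log_std = nn.Parameter(torch.zeros(action_size))

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        mu = self.mu_head(x)
        std = torch.exp(self.log_std)
        return mu, std

    def get_action(self, mu, std):
        # Sample a stochastic action from N(mu, std) and return it as a numpy array
        dist = torch.distributions.Normal(mu, std)
        return dist.sample().detach().numpy()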
class TD3Agent:
    def __init__(self, env, render, config_info):
        self.env = env
        self._reset_env()
        self.render = render

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]
        self.max_action = float(env.action_space.high[0])

        # Define models: twin critics and actor, each with a target copy
        hidden_size = param["model"]["hidden_size"]
        self.critic = Critic(state_dim, self.num_actions, hidden_size).to(self.device)
        self.target_critic = Critic(state_dim, self.num_actions, hidden_size).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.policy = Actor(state_dim, self.num_actions, hidden_size, self.max_action).to(self.device)
        self.target_policy = Actor(state_dim, self.num_actions, hidden_size, self.max_action).to(self.device)
        self.target_policy.load_state_dict(self.policy.state_dict())

        # Define loss criterion
        self.criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.critic_opt = optim.Adam(self.critic.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_max_size"])
        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.noise_policy = param["training"]["noise_policy"]
        self.noise_clip = param["training"]["noise_clip"]
        self.noise_explor = param["training"]["noise_explor"]
        self.update_freq_policy = param["training"]["update_freq_policy"]
        self.eval_freq = param["training"]["eval_freq"]
        self.num_eval_episodes = param["training"]["num_eval_episodes"]

    def _reset_env(self):
        # Reset the environment and initialize episode statistics
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0
        self.episode_step = 0

    def train(self):
        # Main training loop
        total_timestep = 0
        all_episode_rewards = []
        all_mean_rewards = []
        update = 0

        # Create tensorboard writer
        writer = SummaryWriter(log_dir=self.path_runs, comment="-td3")

        for episode in itertools.count(1, 1):
            self._reset_env()

            while not self.done:
                # Trick to improve exploration at the start of training:
                # act randomly for the first start_step timesteps
                if self.start_step > total_timestep:
                    action = self.env.action_space.sample()  # Sample random action
                else:
                    # Select the policy action and add Gaussian exploration noise
                    policy_action = self.policy.get_action(self.state, self.device)
                    add_noise_action = np.random.normal(
                        loc=0, scale=self.noise_explor, size=self.num_actions
                    )
                    noisy_action = policy_action + add_noise_action
                    action = np.clip(noisy_action, -self.max_action, self.max_action)

                # Once the replay buffer holds enough transitions, update the networks
                if (len(self.replay_buffer) > self.batch_size
                        and total_timestep > self.start_step):
                    batch = self.replay_buffer.sample_buffer(self.batch_size)
                    # Count gradient updates here so the delayed policy update
                    # in train_on_batch() actually sees an increasing counter
                    update += 1
                    self.train_on_batch(batch, update, writer)

                if self.render:
                    self.env.render()

                # Perform one step in the environment
                next_state, reward, self.done, _ = self.env.step(action)
                total_timestep += 1
                self.episode_step += 1
                self.episode_reward += reward

                # Create a tuple for the new transition
                new_transition = self.transition(
                    self.state, action, reward, self.done, next_state
                )

                # Append transition to the replay buffer
                self.replay_buffer.store_transition(new_transition)

                self.state = next_state

                if total_timestep > self.max_timesteps:
                    break

            # Append the episode reward before computing the running mean so the
            # mean is never taken over an empty list (NaN on the first episode)
            all_episode_rewards.append(self.episode_reward)
            mean_reward = np.mean(all_episode_rewards[-50:])
            all_mean_rewards.append(mean_reward)

            print("Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; "
                  "reward {} ; mean reward {}".format(
                      episode,
                      total_timestep,
                      self.max_timesteps,
                      self.episode_step,
                      round(self.episode_reward, 2),
                      round(mean_reward, 2),
                  ))

            writer.add_scalar("reward", self.episode_reward, episode)
            writer.add_scalar("mean reward", mean_reward, episode)

            # Let's evaluate TD3
            if episode % self.eval_freq == 0:
                avg_eval_return = self.eval()
                writer.add_scalar("eval/reward", avg_eval_return, episode)

            # Stop training once the timestep budget is exhausted
            if total_timestep > self.max_timesteps:
                break

        # Save networks' weights
        path_critic = os.path.join(self.path_runs, "critic.pth")
        path_actor = os.path.join(self.path_runs, "actor.pth")
        torch.save(self.critic.state_dict(), path_critic)
        torch.save(self.policy.state_dict(), path_actor)

        # Plot reward
        self.plot_reward(all_episode_rewards, all_mean_rewards)

        # Close all
        writer.close()
        self.env.close()

    def train_on_batch(self, batch_samples, update, writer):
        # Unpack batch_size transitions randomly drawn from the replay buffer
        (
            state_batch,
            action_batch,
            reward_batch,
            done_int_batch,
            next_state_batch,
        ) = batch_samples

        # Transform np arrays into tensors and send them to device
        state_batch = torch.tensor(state_batch).to(self.device)
        next_state_batch = torch.tensor(next_state_batch).to(self.device)
        action_batch = torch.tensor(action_batch).to(self.device)
        reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device)
        done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device)

        ### Update the critics
        with torch.no_grad():
            # Target policy smoothing: add clipped noise to the target action
            add_noise = torch.clamp(
                torch.randn_like(action_batch) * self.noise_policy,
                min=-self.noise_clip,
                max=self.noise_clip,
            )
            next_action = torch.clamp(
                self.target_policy(next_state_batch) + add_noise,
                min=-self.max_action,
                max=self.max_action,
            )

            # Clipped double-Q learning: bootstrap from the smaller of the two target critics
            target_q1, target_q2 = self.target_critic(next_state_batch, next_action)
            target_q = torch.min(target_q1, target_q2)
            target_q = reward_batch + self.gamma * (1 - done_int_batch) * target_q

        # Estimated state-action values
        q1, q2 = self.critic(state_batch, action_batch)
        q1_loss = self.criterion(q1, target_q)
        q2_loss = self.criterion(q2, target_q)

        # Optimize both critics jointly with a single backward pass
        critic_loss = q1_loss + q2_loss
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        writer.add_scalar("loss/q1", q1_loss.item(), update)
        writer.add_scalar("loss/q2", q2_loss.item(), update)

        # Delayed policy update: refresh the actor and the target networks
        # less frequently than the critics
        if update % self.update_freq_policy == 0:
            action = self.policy(state_batch)
            # The critic returns both Q estimates; the policy is improved on the first one
            q1, _ = self.critic(state_batch, action)
            policy_loss = -q1.mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            writer.add_scalar("loss/policy", policy_loss.item(), update)

            # Update target networks with Polyak averaging
            soft_update(self.target_critic, self.critic, self.tau)
            soft_update(self.target_policy, self.policy, self.tau)

    def eval(self):
        # Runs the policy for num_eval_episodes episodes and returns the average reward
        # eval_env.seed(seed + 100)
        avg_reward = 0.0

        for _ in range(self.num_eval_episodes):
            self._reset_env()
            while not self.done:
                action = self.policy.get_action(np.array(self.state), self.device)
                next_state, reward, self.done, _ = self.env.step(action)
                avg_reward += reward
                self.state = next_state

        avg_reward /= self.num_eval_episodes

        print(
            f"Evaluation over {self.num_eval_episodes} episodes: {avg_reward:.3f}"
        )
        return avg_reward

    def plot_reward(self, data, mean_data):
        # Plot per-episode rewards and their running mean, then save the figure
        plt.plot(data, label="reward")
        plt.plot(mean_data, label="mean reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(
            f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment"
        )
        plt.tight_layout()
        plt.legend()

        path_fig = os.path.join(self.path_runs, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")
        plt.show()
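# TD3Agent relies on a soft_update() helper and a ReplayBuffer class that are
# defined elsewhere in the repository. The sketches below only illustrate the
# interface the agent expects (argument order, method names, and field layout
# are inferred from the calls above; the actual implementations may differ).
import random
from collections import deque

import numpy as np
import torch


def soft_update(target_net, source_net, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)


class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def __len__(self):
        return len(self.buffer)

    def store_transition(self, transition):
        # transition is the namedtuple (state, action, reward, done, next_state)
        self.buffer.append(transition)

    def sample_buffer(self, batch_size):
        # Return a batch as five stacked float32 arrays, in the order unpacked
        # by TD3Agent.train_on_batch()
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, dones, next_states = map(np.array, zip(*batch))
        return (
            states.astype(np.float32),
            actions.astype(np.float32),
            rewards.astype(np.float32),
            dones.astype(np.float32),
            next_states.astype(np.float32),
        )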
def main(seed_num):
    env = gym.make("Pendulum-v0")
    # env = RedEnv()
    env.seed(seed_num)
    torch.manual_seed(seed_num)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    print()
    print("env(PPO) : ", args.env_name)
    print("state_size : ", state_size)
    print("action_size : ", action_size)
    print()

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, args)
    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    writer = SummaryWriter(args.logdir)
    recent_rewards = deque(maxlen=100)
    episodes = 0

    for iter in range(args.max_iter_num):
        trajectories = deque()
        steps = 0

        # Collect at least total_sample_size environment steps per iteration
        while steps < args.total_sample_size:
            done = False
            score = 0
            episodes += 1

            state = env.reset()  # start distribution
            state = np.reshape(state, [1, state_size])

            while not done:
                steps += 1
                env.render()

                mu, std = actor(torch.Tensor(state))
                action = actor.get_action(mu, std)
                next_state, reward, done, info = env.step(action)

                mask = 0 if done else 1
                trajectories.append((state, action, reward, mask))

                score += reward
                next_state = np.reshape(next_state, [1, state_size])
                state = next_state

                if done:
                    recent_rewards.append(score)
                    if iter % args.log_interval == 0:
                        writer.add_scalar('log/ep_len', steps, episodes)

        # Update actor and critic on the collected trajectories
        actor.train()
        critic.train()
        update(actor, critic, actor_optimizer, critic_optimizer,
               trajectories, state_size, action_size)

        writer.add_scalar('log/score', float(score), episodes)

        if iter % args.log_interval == 0:
            tmp = np.array(recent_rewards)
            mean_reward = tmp.mean()
            std_reward = tmp.std()
            min_reward = tmp.min()
            max_reward = tmp.max()
            print('{} iter | {} episode | score_avg: {:.2f} | score_std: {:.2f} | '
                  'score_min: {:.2f} | score_max: {:.2f}'.format(
                      iter, episodes, mean_reward, std_reward, min_reward, max_reward))

        # Stop once the recent average score reaches the goal, and save the models
        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)

            ckpt_path_a = os.path.join(args.save_path, str(seed_num) + 'th_model_a.pth.tar')
            ckpt_path_c = os.path.join(args.save_path, str(seed_num) + 'th_model_c.pth.tar')
            torch.save(actor.state_dict(), ckpt_path_a)
            torch.save(critic.state_dict(), ckpt_path_c)

            print('Learning Terminated')
            break
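# main() and test() read their hyperparameters from a module-level `args`
# object. The argparse sketch below covers only the fields referenced in this
# file; the default values are placeholders chosen for illustration, not the
# repository's actual settings.
import argparse

parser = argparse.ArgumentParser(description="PPO on continuous-control Gym tasks")
parser.add_argument("--env_name", type=str, default="Pendulum-v0")
parser.add_argument("--actor_lr", type=float, default=3e-4)
parser.add_argument("--critic_lr", type=float, default=3e-4)
parser.add_argument("--max_iter_num", type=int, default=1000)
parser.add_argument("--total_sample_size", type=int, default=2048)
parser.add_argument("--log_interval", type=int, default=5)
parser.add_argument("--goal_score", type=float, default=-300)
parser.add_argument("--logdir", type=str, default="./logs")
parser.add_argument("--save_path", type=str, default="./save_model/")
parser.add_argument("--load_model", type=str, default=None)
parser.add_argument("--test_iter", type=int, default=10)
args = parser.parse_args()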