import torch
import torch.nn as nn

# PPO agent (device passed in explicitly). Assumes an ActorCritic model with
# act()/evaluate() methods is defined elsewhere in the document.
class PPO:
    def __init__(self, state_dim, action_dim, lr, betas, gamma, K_epochs, eps_clip, device):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.device = device

        self.policy = ActorCritic(state_dim, action_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim).to(self.device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of state returns (this variant does not reset
        # the running return at episode boundaries)
        rewards = []
        discounted_reward = 0
        for reward in reversed(memory.rewards):
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_actions = torch.stack(memory.actions).to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(self.device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = (-torch.min(surr1, surr2)
                    + 0.5 * self.MseLoss(state_values, rewards)
                    - 0.01 * dist_entropy)

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
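Both PPO classes in this section read their training batch from a rollout buffer whose fields (states, actions, logprobs, rewards, is_terminals) are only referenced, never defined here. The following is a minimal sketch of such a buffer, reconstructed from those attribute accesses; the class name Memory and the clear_memory() helper are assumptions for illustration, not code taken from the original project.

class Memory:
    # Rollout buffer holding one batch of on-policy experience. Field names
    # mirror the attribute accesses in update(); the class itself is an
    # assumed reconstruction, not the original project's definition.
    def __init__(self):
        self.states = []        # state tensors, one per step (filled by act())
        self.actions = []       # sampled action tensors, one per step
        self.logprobs = []      # log pi_old(a|s) tensors, one per step
        self.rewards = []       # scalar rewards from the environment
        self.is_terminals = []  # episode-end flags (used by the second variant)

    def clear_memory(self):
        # Discard the batch after a PPO update so the next rollout starts fresh.
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]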
# Continuous-action PPO variant used for the drone task. Assumes module-level
# `device`, an ActorCritic(state_dim, action_dim, action_std) model, and a
# Memory buffer (see the sketch above) are defined elsewhere.
class PPO:
    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        # Resume from saved checkpoints when they are available
        try:
            self.policy.load_state_dict(
                torch.load('./PPO_continuous_drone.pth', map_location=device))
            self.policy_old.load_state_dict(
                torch.load('./PPO_continuous_old_drone.pth', map_location=device))
            print('Saved models loaded')
        except Exception:
            print('New models generated')

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of returns, resetting at episode boundaries:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors
        old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding the clipped surrogate loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = (-torch.min(surr1, surr2)
                    + 0.5 * self.MseLoss(state_values, rewards)
                    - 0.01 * dist_entropy)

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
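To show where these pieces fit together, here is a hedged sketch of a training loop driving the continuous-control class above. The environment name, hyperparameter values, the update_timestep schedule, and the classic (pre-0.26) Gym reset/step API are illustrative assumptions, not settings taken from the original drone project.

import gym  # assumes the classic Gym API (reset() -> state, step() -> 4-tuple)

# Illustrative placeholder environment and hyperparameters (assumptions)
env = gym.make('Pendulum-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

memory = Memory()
ppo = PPO(state_dim, action_dim, action_std=0.5, lr=3e-4,
          betas=(0.9, 0.999), gamma=0.99, K_epochs=80, eps_clip=0.2)

update_timestep = 4000          # run a PPO update every N environment steps
timestep = 0
for episode in range(1000):
    state = env.reset()
    for t in range(1500):
        timestep += 1
        # policy_old.act() is expected to append state/action/logprob to memory
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)

        if timestep % update_timestep == 0:
            ppo.update(memory)
            memory.clear_memory()
            timestep = 0
        if done:
            break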