import os
import time
from collections import deque

import numpy as np
import torch
import torch.nn.functional as F
from IPython import embed  # used by the draft agents below for interactive debugging
from torch.distributions import Categorical
from torch.nn.utils import clip_grad_norm_
from torch.optim import RMSprop
import json

# Project-specific pieces assumed from the surrounding repo (not shown here):
# ActorCritic, RolloutStorage, MultiEnv, make_vec_envs, multiActionTransform, use_cuda


# Early draft of an A2C update step; the rollout fields and the
# calc_actual_state_values helper come from the surrounding agent class.
def _update(self):
    # Compute returns: R_t = reward_t + gamma * R_{t+1}
    # rewards and dones come from storage, filled by envs.step() during the rollout
    state_values_true = self.calc_actual_state_values(self.rollouts.rewards,
                                                      self.rollouts.dones)

    # Compute actor critic loss (value_loss, action_loss)
    # OPTIONAL: You can also maximize entropy to encourage exploration
    # loss = value_loss + action_loss (- entropy_weight * entropy)
    s = torch.FloatTensor(self.rollouts.obs)
    action_probs, state_values_est, hiddens = self.model(s)
    action_log_probs = action_probs.log()

    a = torch.LongTensor(self.rollouts.actions).view(-1, 1)
    chosen_action_log_probs = action_log_probs.gather(1, a)

    # The advantage is also the TD error
    advantages = state_values_true - state_values_est

    entropy = -(action_probs * action_log_probs).sum(1).mean()
    action_loss = -(chosen_action_log_probs * advantages.detach()).mean()
    value_loss = advantages.pow(2).mean()
    loss = value_loss + action_loss - 0.0001 * entropy  # entropy_weight = 0.0001

    # Update
    self.optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(self.model.parameters(), self.grad_norm)
    self.optimizer.step()

    # Clear rollouts after update
    self.rollouts.reset()

    return loss.item()
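# A minimal sketch of the calc_actual_state_values helper referenced above; it
# is not defined in this file. The (n_steps, n_processes, 1) tensor shapes and
# the absence of a bootstrap value for the final step are assumptions, not the
# assignment's reference implementation.
def calc_actual_state_values(self, rewards, dones):
    # R_t = r_t + gamma * R_{t+1}, restarting the running sum at episode ends
    returns = torch.zeros_like(rewards)
    running_return = torch.zeros_like(rewards[0])
    for t in reversed(range(rewards.size(0))):
        running_return = rewards[t] + self.gamma * running_return * (1 - dones[t])
        returns[t] = running_return
    return returns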
class AgentA2C:
    def __init__(self, env, args):
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = args.remotes
        self.seed = 42
        self.max_steps = 1e9
        self.grad_norm = 0.5
        self.entropy_weight = 0.05
        self.eps = np.finfo(np.float32).eps.item()

        ####################### NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 1000
        self.save_freq = 1
        self.save_dir = './ckpts/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = MultiEnv()
            self.envs.configure(remotes=self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        observation = self.envs.reset()
        self.obs_shape = np.transpose(observation[0], (2, 0, 1)).shape
        self.act_shape = args.action_space

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_a2c:
            self.load_model('./ckpts/model_1239.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute returns: R_t = reward_t + gamma * R_{t+1},
        # bootstrapped from the critic's value of the last observation
        with torch.no_grad():
            next_value, _, _ = self.model(self.rollouts.obs[-1],
                                          self.rollouts.hiddens[-1],
                                          self.rollouts.masks[-1])

        self.rollouts.returns[-1] = next_value.detach()
        for step in reversed(range(self.rollouts.rewards.size(0))):
            self.rollouts.returns[step] = self.rollouts.rewards[step] + \
                (self.rollouts.returns[step + 1] *
                 self.gamma *
                 self.rollouts.masks[step + 1])

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        values, action_probs, _ = self.model(
            self.rollouts.obs[:-1].view(-1, self.obs_shape[0],
                                        self.obs_shape[1], self.obs_shape[2]),
            self.rollouts.hiddens[0],
            self.rollouts.masks[:-1].view(-1, 1))

        distribution = torch.distributions.Categorical(action_probs)
        log_probs = distribution.log_prob(
            self.rollouts.actions.flatten()).flatten()

        returns = self.rollouts.returns[:-1].flatten()
        values = values.flatten()

        value_loss = F.smooth_l1_loss(values, returns)
        advantages = returns - values
        action_loss = -(log_probs * advantages.detach()).mean()
        entropy = distribution.entropy().mean()
        loss = value_loss + action_loss - self.entropy_weight * entropy

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            actions = torch.distributions.Categorical(action_probs).sample()

        transformed_action = multiActionTransform(actions.cpu().numpy())
        obs, rewards, dones, infos = self.envs.step(transformed_action)

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs = torch.from_numpy(obs).to(self.device).permute(0, 3, 1, 2)
        masks = torch.from_numpy(1 - dones).to(self.device)
        rewards = torch.from_numpy(rewards).to(self.device)

        # Extra penalty when an episode ends (mask == 0)
        penalty_rewards = (1 - masks) * -10
        rewards = rewards + penalty_rewards.double()

        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values,
                             rewards.unsqueeze(1), masks.unsqueeze(1))

    def train(self):
        print('~' * 150)
        print('~' * 150)
        print('~' * 67 + 'START TRAINING' + '~' * 67)
        print('~' * 150)
        print('~' * 150)

        running_reward = deque(maxlen=self.update_freq * 2)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device).permute(
            0, 3, 1, 2)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        max_reward = 0.0
        counter = 0
        continual_crash = 0

        while True:
            try:
                # Update once every n-steps
                for step in range(self.update_freq):
                    self._step(self.rollouts.obs[step],
                               self.rollouts.hiddens[step],
                               self.rollouts.masks[step])

                    # Calculate episode rewards
                    episode_rewards += self.rollouts.rewards[step]
                    for r, m in zip(episode_rewards,
                                    self.rollouts.masks[step + 1]):
                        if m == 0:
                            running_reward.append(r.item())
                    episode_rewards *= self.rollouts.masks[step + 1]

                loss = self._update()
                total_steps += self.update_freq * self.n_processes

                # Log & save model
                if len(running_reward) == 0:
                    avg_reward = 0
                else:
                    avg_reward = sum(running_reward) / len(running_reward)

                if total_steps % self.display_freq == 0:
                    print('Steps: %d/%d | Avg reward: %f | Max reward: %f' %
                          (total_steps, self.max_steps, avg_reward, max_reward))
                    with open('a2c_log.txt', 'a') as fout:
                        fout.write(str(avg_reward) + '\n')

                if total_steps % self.save_freq == 0:
                    self.save_model('model_{}.pt'.format(counter), avg_reward)
                    counter += 1

                if avg_reward > max_reward:
                    max_reward = avg_reward
                    self.save_model('model_max_{}.pt'.format(counter),
                                    max_reward)
                    counter += 1

                if total_steps >= self.max_steps:
                    break

                continual_crash = 0

            except Exception as e:
                continual_crash += 1
                if continual_crash >= 10:
                    print('=' * 140)
                    print(e)
                    print("Crashed 10 times -- stopping")
                    print('=' * 140)
                    raise e
                else:
                    print('#' * 141)
                    print(e)
                    print("Env crash, making new env")
                    print('#' * 141)

                    time.sleep(60)
                    self.envs = MultiEnv(resize=(250, 150))
                    self.envs.configure(remotes=self.n_processes)
                    time.sleep(60)

    def save_model(self, filename, max_reward):
        if not os.path.isdir(self.save_dir):
            os.mkdir(self.save_dir)
        print('model saved: ' + filename + ' (' + str(max_reward) + ')')
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        if use_cuda:
            self.model = torch.load(path)
        else:
            self.model = torch.load(path, map_location='cpu')

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        with torch.no_grad():
            observation = torch.from_numpy(observation).float().permute(
                0, 3, 1, 2).to(self.device)
            _, action_prob, hidden = self.model(
                observation, self.hidden, torch.ones(1, 1).to(self.device))
            self.hidden = hidden
            action = torch.distributions.Categorical(action_prob).sample()
        return action.cpu().numpy()
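# A standalone sketch of the n-step return computed in AgentA2C._update above,
# pulled out of the rollout storage for clarity. The tensor shapes
# (n_steps, n_processes, 1), the convention that masks holds one more step than
# rewards, and mask = 1 - done are assumptions taken from the code above, not a
# separate reference implementation.
def bootstrapped_returns(rewards, masks, next_value, gamma):
    # returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t + 1],
    # seeded with the critic's estimate V(s_T) for the state after the rollout
    n_steps = rewards.size(0)
    returns = torch.zeros(n_steps + 1, *rewards.shape[1:], device=rewards.device)
    returns[-1] = next_value
    for t in reversed(range(n_steps)):
        returns[t] = rewards[t] + gamma * returns[t + 1] * masks[t + 1]
    return returns[:-1]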
class AgentMario:
    def __init__(self, env, args):
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 64
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        ####################### NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:1" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_mario:
            self.load_model(os.path.join('mario.pt'))
            print('finish model loading ...')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute returns: R_t = reward_t + gamma * R_{t+1}
        rewards = self.rollouts.rewards
        obs = self.rollouts.obs
        hiddens = self.rollouts.hiddens
        masks = self.rollouts.masks
        actions = self.rollouts.actions
        preds = self.rollouts.value_preds  # (n_steps + 1) x n_processes x 1

        Vt = preds[:-1]
        Vt_1 = self.gamma * preds[1:] * masks[:-1]

        # One-step TD error used as the advantage (no gradient through it)
        Advantage = (rewards - (Vt - Vt_1)).detach()
        R = Advantage.squeeze(-1)

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        entropys = []
        logP = []
        Q_values = []
        for idx, (ob, hidden, mask) in enumerate(zip(obs, hiddens, masks)):
            value, action_prob, _ = self.model(ob, hidden, mask)
            Q_values.append(value)
            if idx != obs.size(0) - 1:
                m = Categorical(action_prob)
                logP.append(m.log_prob(actions[idx].squeeze(-1)))
                entropys.append(torch.mean(m.entropy()))

        logP = torch.stack(logP, 0)
        action_loss = torch.mean(-R * logP)

        Q_values = torch.stack(Q_values, 0)
        Qt = Q_values[:-1]
        Qt_1 = rewards + self.gamma * preds[1:] * masks[:-1]
        mse = torch.nn.MSELoss()
        value_loss = mse(Qt, Qt_1)

        entropys = sum(entropys) / len(entropys)
        loss = value_loss + action_loss - self.entropy_weight * entropys

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            m = Categorical(action_probs)
            actions = m.sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs = torch.FloatTensor(np.float32(obs))
        rewards = torch.FloatTensor(np.float32(rewards))
        dones = torch.FloatTensor(np.float32(dones)).unsqueeze(1)
        masks = torch.ones(masks.shape) - dones

        self.rollouts.insert(obs, hiddens, actions.unsqueeze(-1), values,
                             rewards.unsqueeze(-1), masks)

    def train(self):
        # Log average rewards to a file
        import logging
        logging.basicConfig(filename="mario_reward.log", level=logging.INFO)

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                logging.info("{},{}".format(total_steps, avg_reward))
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path, map_location=torch.device('cpu'))

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use your model to choose an action (greedy w.r.t. the policy)
        observation = torch.from_numpy(observation).float().unsqueeze(0).to(
            self.device)
        hidden = torch.zeros(1, self.hidden_size).to(self.device)
        mask = torch.ones(1, 1).to(self.device)
        value, action_prob, _ = self.model(observation, hidden, mask)
        action = torch.argmax(action_prob, dim=1).item()
        return action
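# The Advantage above is the one-step TD error written in a roundabout way:
# rewards - (Vt - Vt_1) = r_t + gamma * V(s_{t+1}) * mask - V(s_t).
# A sketch of the same quantity with the mask indexed at t + 1, matching the
# storage convention used by the other agents in this file (masks holds one more
# step than rewards); that indexing is an assumption about RolloutStorage.
def td_error_advantage(rewards, value_preds, masks, gamma):
    # delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
    return (rewards + gamma * value_preds[1:] * masks[1:] - value_preds[:-1]).detach()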
class AgentMario:
    def __init__(self, env, args):
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        ####################### NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # TODO: Compute returns
        # R_t = reward_t + gamma * R_{t+1}
        # This version is still a draft: it drops into an IPython shell and
        # relies on an external process_rollout() helper that is not defined
        # in this file.
        embed()
        # running_add = next_value[-1]
        actions, policies, values, returns, advantages = process_rollout(
            args, steps, cuda)

        for step in range(self.update_freq):
            # TODO:
            # Compute actor critic loss (value_loss, action_loss)
            # OPTIONAL: You can also maximize entropy to encourage exploration
            # loss = value_loss + action_loss (- entropy_weight * entropy)
            loss = actor_loss.mean() + 0.5 * critic_loss \
                - self.entropy_weight * entropy.mean()

            # Update
            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.grad_norm)
            self.optimizer.step()

        # TODO:
        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            m = Categorical(action_probs)
            actions = m.sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs = torch.Tensor(obs)
        rewards = torch.Tensor(rewards)
        masks = torch.Tensor(1 - dones)

        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values,
                             rewards.unsqueeze(1), masks.unsqueeze(1))

    def train(self):
        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # TODO: Use your model to choose an action
        # Draft only: the sampling code below is still commented out, so
        # `action` is undefined until it is filled in.
        embed()
        # state = torch.from_numpy(state).float().unsqueeze(0)
        # state = state.cuda() if use_cuda else state
        # probs = self.model(state)
        # m = Categorical(probs)
        # action = m.sample()
        # self.model.saved_log_probs.append(m.log_prob(action))
        # self.saved_actions.append(m.log_prob(action))
        return action.item()
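# The _update above unpacks a process_rollout() helper that is not defined in
# this file. A rough sketch of the returns/advantages part of such a helper,
# built on the RolloutStorage fields used by the other agents here; the name,
# signature, and field layout are assumptions, not the helper the draft imports.
def compute_returns_and_advantages(rollouts, model, gamma):
    with torch.no_grad():
        # Bootstrap from the critic's value of the observation after the rollout
        next_value, _, _ = model(rollouts.obs[-1], rollouts.hiddens[-1],
                                 rollouts.masks[-1])
    returns = torch.zeros_like(rollouts.value_preds)
    returns[-1] = next_value
    for t in reversed(range(rollouts.rewards.size(0))):
        returns[t] = rollouts.rewards[t] + gamma * returns[t + 1] * rollouts.masks[t + 1]
    advantages = returns[:-1] - rollouts.value_preds[:-1]
    return returns[:-1], advantages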
class AgentMario:  # actor agent
    def __init__(self, env, args):
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.99
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        ####################### NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 10000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape  # (4, 84, 84)
        self.act_shape = self.envs.action_space.n  # 12

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_mario:
            # Load after the model is constructed so the checkpoint is not overwritten
            self.load_model('./checkpoints/model.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute returns
        # self.rollouts.obs:     [6, 16, 4, 84, 84]
        # self.rollouts.actions: [5, 16, 1]
        obs_shape = self.rollouts.obs.size()[2:]          # torch.Size([4, 84, 84])
        action_shape = self.rollouts.actions.size()[-1]   # 1
        num_steps, num_processes, _ = self.rollouts.rewards.size()
        # see https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/a2c_acktr.py line 38-43

        # R_t = reward_t + gamma * R_{t+1}
        discounted_return = torch.zeros(self.update_freq, self.n_processes,
                                        1).to(self.device)
        for t in range(self.update_freq - 1, -1, -1):
            discounted_return[t] = self.rollouts.rewards[t] + \
                self.gamma * self.rollouts.value_preds[t + 1]

        # obs[:-1] drops the last observation:
        #   obs[:-1]                      -> [5, 16, 4, 84, 84]
        #   obs[:-1].view(-1, *obs_shape) -> [80, 4, 84, 84]  (n_steps * n_processes)
        #   hiddens[0]                    -> [16, 512]
        #   masks[:-1].view(-1, 1)        -> [80, 1]
        values, action_probs, hiddens = self.model(
            self.rollouts.obs[:-1].view(-1, *obs_shape),
            self.rollouts.hiddens[0].view(-1, self.model.hidden_size),
            self.rollouts.masks[:-1].view(-1, 1))
        # values: [5, 16, 1], action_probs: [5, 16, 12], hiddens: [16, 512]
        values = values.view(num_steps, num_processes, 1)
        action_probs = action_probs.view(num_steps, num_processes, -1)

        # Pick the probability of the action that was actually taken: [5, 16, 1]
        action_probs = action_probs.gather(2, self.rollouts.actions)
        action_log_probs = action_probs.log()

        # advantage = r_t + gamma * V(s_{t+1}) - V(s_t)
        advantages = discounted_return - values

        # value loss is the critic loss; action loss is the actor loss
        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # (use the output entropy as a regularizer for pi(s))
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 260-267
        # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/tree/master/a2c_ppo_acktr
        critic_loss = advantages.pow(2).mean()
        actor_loss = -(advantages.detach() * action_log_probs).mean()
        loss = actor_loss + critic_loss

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        # One interaction step for all workers.
        # obs: [16, 4, 84, 84], hiddens: [16, 512], masks: [16, 1]
        # (16 is n_processes, i.e. 16 parallel workers)
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 256-257
            # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/main.py line 113-132
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            # values: V(s_t); action_probs: softmax(policy), [16, 12]
            m = Categorical(action_probs)
            actions = m.sample()
            action_log_probs = m.log_prob(actions).unsqueeze(1)
            actions = actions.squeeze(0)
            # If the model is not recurrent, hiddens and masks are not needed.

        # obs becomes s_{t+1}; envs.step feeds 16 actions to 16 environments
        # (shmem_vec_env.py step_async); rewards are r_t for taking a_t
        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        values = values.squeeze(0)
        actions = actions.unsqueeze(1)
        obs = torch.from_numpy(obs)
        rewards = torch.from_numpy(rewards).unsqueeze(1)
        masks = torch.from_numpy(1 - dones).unsqueeze(1)

        # Store transitions (obs: s_{t+1}, hiddens, actions: a_t,
        # values: V(s_t), rewards: r_t, masks = 1 - dones)
        self.rollouts.insert(obs, hiddens, actions, action_log_probs, values,
                             rewards, masks)

    def train(self):
        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)  # [16, 4, 84, 84]
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        # self.rollouts.obs: [6, 16, 4, 84, 84]; 6 is n_steps + 1 (see a2c/storage.py)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        print("Save the model to ", self.save_dir)
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        print("Load the model from ", path)
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use your model to choose an action.
        # Called from test.py (line 41); observation is a single frame stack (4, 84, 84).
        # NOTE: test=True and test=False are not differentiated yet.
        # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py line 170
        # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/evaluation.py line 20-31
        observation = torch.from_numpy(observation).to(self.device).unsqueeze(0)

        eval_recurrent_hidden_states = torch.zeros(self.n_processes,
                                                   self.model.hidden_size,
                                                   device=self.device)
        eval_masks = torch.zeros(self.n_processes, 1, device=self.device)

        _, action_probs, _ = self.model(observation,
                                        eval_recurrent_hidden_states,
                                        eval_masks)
        # action_probs: [1, 12]; act greedily at test time
        action = action_probs.max(1)[1].item()
        return action
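# A small usage sketch for make_action above: greedy evaluation over one
# episode. The single-environment argument and its gym-style reset/step API
# are assumptions (the actual test harness lives in test.py, referenced above).
def evaluate_one_episode(agent, env):
    agent.init_game_setting()
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = agent.make_action(obs, test=True)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    return total_reward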
class AgentMario:
    def __init__(self, env, args):
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 6e6
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        ####################### NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_mario:
            # Load after the model is constructed so the checkpoint is not overwritten
            self.load_model('./checkpoints/model.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute returns: R_t = reward_t + gamma * R_{t+1}
        for step in reversed(range(self.rollouts.rewards.size(0))):
            self.rollouts.returns[step] = self.rollouts.returns[step + 1] * \
                self.gamma * self.rollouts.masks[step + 1] + \
                self.rollouts.rewards[step]

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        obs_shape = self.rollouts.obs.size()[2:]
        action_shape = self.rollouts.actions.size()[-1]
        num_steps, num_processes, _ = self.rollouts.rewards.size()

        values, action_probs, hiddens = self.model(
            self.rollouts.obs[:-1].view(-1, *obs_shape),
            self.rollouts.hiddens[0].view(-1, 512),
            self.rollouts.masks[:-1].view(-1, 1))

        m = Categorical(action_probs)
        log_probs = m.log_prob(self.rollouts.actions.view(-1))
        entropys = m.entropy().mean()

        values = values.view(num_steps, num_processes, 1)
        log_probs = log_probs.view(num_steps, num_processes, 1)

        advantages = self.rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(advantages.detach() * log_probs).mean()
        loss = (value_loss + action_loss) - (entropys * self.entropy_weight)

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            m = Categorical(action_probs)
            actions = m.sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        masks = torch.FloatTensor([[0.0] if done else [1.0]
                                   for done in dones])
        obs = torch.from_numpy(obs).to(self.device)
        rewards = torch.from_numpy(rewards).unsqueeze(1).to(self.device)
        actions = actions.unsqueeze(1)

        self.rollouts.insert(obs, hiddens, actions, values, rewards, masks)

    def train(self):
        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        x_value = []
        y_value = []

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))
                x_value.append(total_steps)
                y_value.append(avg_reward)

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            # if avg_reward > 5000:
            #     self.save_model('model.pt')
            #     x_value.append(total_steps)
            #     y_value.append(avg_reward)
            #     break

            if total_steps >= self.max_steps:
                break

        self.save_curve(x_value, y_value, 'mario_curve')

    def save_curve(self, x_values, y_values, title):
        # Append the (steps, avg_reward) curve for this run to mario.json
        tmp = {title: {'x': x_values, 'y': y_values}}

        if os.path.isfile('./mario.json'):
            with open('mario.json', 'r') as f:
                file = json.load(f)
            file.update(tmp)
            with open('mario.json', 'w') as f:
                json.dump(file, f)
        else:
            with open('mario.json', 'w') as f:
                json.dump(tmp, f)

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use your model to choose an action
        if test:
            with torch.no_grad():
                obs = torch.from_numpy(observation).to(self.device)
                self.rollouts.obs[0].copy_(obs)
                self.rollouts.to(self.device)
                _, action_probs, self.rollouts.hiddens[0] = self.model(
                    self.rollouts.obs[0], self.rollouts.hiddens[0],
                    self.rollouts.masks[0])
                m = Categorical(action_probs)
                action = m.sample().cpu().numpy()
        return action[0]
class AgentA2C:
    def __init__(self, env, args):
        self.use_gae = True
        self.use_standard = False

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.90
        self.tau = 0.95
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.clip_param = 0.2
        self.entropy_weight = 0.05

        ####################### NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        ####################### Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 20000

        if args.test_a2c:
            if args.model_path is None:
                raise Exception('give --model_path')
        else:
            if args.folder_name is None:
                raise Exception('give --folder_name')
            self.model_dir = os.path.join('./model', args.folder_name)
            if not os.path.exists(self.model_dir):
                os.mkdir(self.model_dir)
            self.plot = {'reward': []}

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size, self.recurrent)

        self.ppo_epochs = 4
        self.ppo_batch_size = 5

        if args.test_a2c:
            self.load_model(args.model_path)
        self.model = self.model.to(self.device)

        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()

    def ppo_iter(self, mini_batch_size, states, hiddens, masks, actions,
                 action_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], hiddens[rand_ids, :],
                   masks[rand_ids, :], actions[rand_ids, :],
                   action_probs[rand_ids, :], returns[rand_ids, :],
                   advantage[rand_ids, :])

    def _update(self):
        # Compute returns: R_t = reward_t + gamma * R_{t+1}
        with torch.no_grad():
            Return = self.model.get_estimate_returns(self.rollouts.obs[-1],
                                                     self.rollouts.hiddens[-1],
                                                     self.rollouts.masks[-1])
        self.rollouts.value_preds[-1].copy_(Return)
        self.rollouts.returns[-1].copy_(Return * self.rollouts.masks[-1])

        if self.use_standard:
            # Standardize rewards
            self.rollouts.rewards = (
                self.rollouts.rewards -
                self.rollouts.rewards.mean()) / self.rollouts.rewards.std()

        if self.use_gae:
            # Generalized Advantage Estimation
            gae = 0
            for r in reversed(range(len(self.rollouts.rewards))):
                delta = self.rollouts.rewards[r] \
                    + self.gamma * self.rollouts.value_preds[r + 1] * self.rollouts.masks[r + 1] \
                    - self.rollouts.value_preds[r]
                gae = delta + self.gamma * self.tau * self.rollouts.masks[r + 1] * gae
                Return = gae + self.rollouts.value_preds[r]
                self.rollouts.returns[r].copy_(Return)
        else:
            for r in reversed(range(len(self.rollouts.rewards))):
                Return = self.rollouts.rewards[r] + \
                    self.gamma * Return * self.rollouts.masks[r + 1]
                self.rollouts.returns[r].copy_(Return)

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        # NOTE: the model returns action *probabilities*; logs are taken below.
        with torch.no_grad():
            est_returns, old_action_probs, _ = self.model(
                self.rollouts.obs[:-1].view(
                    self.n_processes * self.update_freq, *self.obs_shape),
                self.rollouts.hiddens[:-1].view(
                    self.n_processes * self.update_freq, -1),
                self.rollouts.masks[:-1].view(
                    self.n_processes * self.update_freq, -1),
            )

        states = self.rollouts.obs[:-1]
        hiddens = self.rollouts.hiddens[:-1]
        masks = self.rollouts.masks[:-1]
        actions = self.rollouts.actions
        returns = self.rollouts.returns[:-1]

        est_returns = est_returns.view(self.update_freq, self.n_processes, -1)
        old_action_probs = old_action_probs.gather(
            1, actions.view(self.n_processes * self.ppo_batch_size,
                            -1)).view(self.update_freq, self.n_processes, -1)
        advantages = returns - est_returns

        all_loss = []
        for _ in range(self.ppo_epochs):
            for state, hidden, mask, action, old_probs, return_, advantage \
                    in self.ppo_iter(self.ppo_batch_size, states, hiddens,
                                     masks, actions, old_action_probs,
                                     returns, advantages):
                action = action.view(self.n_processes * self.ppo_batch_size, -1)
                return_ = return_.view(self.n_processes * self.ppo_batch_size, -1)
                state = state.view(self.n_processes * self.ppo_batch_size,
                                   *self.obs_shape)
                hidden = hidden.view(self.n_processes * self.ppo_batch_size, -1)
                mask = mask.view(self.n_processes * self.ppo_batch_size, -1)
                old_probs = old_probs.view(
                    self.n_processes * self.ppo_batch_size, -1)
                advantage = advantage.view(
                    self.n_processes * self.ppo_batch_size, -1)

                value, new_action_probs, _ = self.model(state, hidden, mask)

                # ratio = pi_new(a|s) / pi_old(a|s)
                ratio = (new_action_probs.gather(1, action).log() -
                         old_probs.log()).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * advantage

                # action loss (clipped policy surrogate)
                action_loss = -torch.min(surr1, surr2).mean()

                # value loss
                value_loss = (return_ - value).pow(2).mean()

                # entropy of the new policy
                entropy = -(new_action_probs *
                            new_action_probs.log()).sum(1).mean()

                # loss
                loss = 0.5 * value_loss + action_loss - self.entropy_weight * entropy

                # Update
                self.optimizer.zero_grad()
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.grad_norm)
                self.optimizer.step()

                all_loss.append(loss.item())

        # Clear rollouts after update
        self.rollouts.reset()

        return sum(all_loss) / len(all_loss)

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            # Sample actions from the output distributions
            actions = Categorical(action_probs.detach()).sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        obs = torch.from_numpy(obs)
        rewards = torch.from_numpy(rewards).unsqueeze(1)
        masks = torch.from_numpy(1 - dones).unsqueeze(1)
        actions = actions.unsqueeze(1)

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        self.rollouts.insert(
            obs,           # next observation
            hiddens,       # next hidden state
            actions,       # current action
            action_probs,  # current policy output
            values,        # current value estimate
            rewards,       # current reward
            masks)         # next mask

    def train(self):
        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0
        best_reward = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            self.plot['reward'].append(avg_reward)
            print('Steps: %d/%d | Avg reward: %f | Loss: %f' %
                  (total_steps, self.max_steps, avg_reward, loss), end='\r')

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                with open(os.path.join(self.model_dir, 'plot.json'), 'w') as f:
                    json.dump(self.plot, f)

                # if int(avg_reward) > best_reward:
                best_reward = int(avg_reward)
                self.save_model(
                    os.path.join(
                        self.model_dir,
                        's{}_r{}_model.pt'.format(total_steps, best_reward)))

            if total_steps >= self.max_steps:
                break

    def save_model(self, path):
        torch.save(
            {
                'model': self.model,
                'optimizer': self.optimizer.state_dict()
            }, path)

    def load_model(self, path):
        print('Load model from', path)
        self.model = torch.load(path)['model']

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        obs = torch.FloatTensor([observation]).to(self.device)
        with torch.no_grad():
            action_probs, _ = self.model.get_action_probs(obs, None, None)
        action = action_probs.max(1)[1].item()
        return action
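# A compact sketch of the clipped surrogate used in _update above, written
# directly in terms of log-probabilities. The (batch, 1) shapes and the 0.2
# clip_param default mirror the code above; this is an illustration, not the
# training code itself.
def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), computed in log space for stability
    ratio = (new_log_probs - old_log_probs).exp()
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()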