def learn(self):
    episode_reward = reward_recorder()
    state = np.array(self.env.reset())
    td_loss = 0
    for timestep in range(self.args.total_timesteps):
        explore_eps = self.exploration_schedule.get_value(timestep)
        with torch.no_grad():
            state_tensor = self._get_tensors(state)
            action_value = self.net(state_tensor)
        # select the action with an epsilon-greedy policy
        action = select_actions(action_value, explore_eps)
        next_state, reward, done, _ = self.env.step(action)
        next_state = np.array(next_state)
        # append the sample to the replay buffer
        self.buffer.add(state, action, reward, next_state, float(done))
        state = next_state
        # accumulate the reward of the current episode
        episode_reward.add_rewards(reward)
        if done:
            state = np.array(self.env.reset())
            episode_reward.start_new_episode()
        # sample a batch from the replay buffer and update the network
        if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
            batch_samples = self.buffer.sample(self.args.batch_size)
            td_loss = self._update_network(batch_samples)
        if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
            self.target_net.load_state_dict(self.net.state_dict())
        if done and episode_reward.num_episodes % self.args.display_interval == 0:
            print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, \
                    episode_reward.num_episodes, episode_reward.mean, td_loss))
            torch.save(self.net.state_dict(), self.model_path + '/model.pt')
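# The reward_recorder used above is not defined in this snippet. A minimal sketch of what it
# is assumed to provide -- per-episode reward accumulation plus the mean over the most recent
# episodes -- is given below; the attribute and method names simply mirror the calls made in
# learn(), and the internals are an assumption rather than the repository's exact code.
import numpy as np

class reward_recorder:
    def __init__(self, history_length=100):
        self.history_length = history_length
        self.buffer = [0.0]       # per-episode returns; the last entry is the running episode
        self.num_episodes = 1     # episodes seen so far, the running one included

    def add_rewards(self, reward):
        # accumulate the reward of the current episode
        self.buffer[-1] += reward

    def start_new_episode(self):
        # close the current episode and open a new slot, keeping a bounded history
        if len(self.buffer) >= self.history_length:
            self.buffer.pop(0)
        self.buffer.append(0.0)
        self.num_episodes += 1

    @property
    def mean(self):
        # mean return over the stored episodes
        return np.mean(self.buffer)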
def learn(self):
    episode_reward = [0.0]
    obs = np.array(self.env.reset())
    td_loss = 0
    for timestep in range(self.args.total_timesteps):
        explore_eps = self.exploration_schedule.get_value(timestep)
        with torch.no_grad():
            obs_tensor = self._get_tensors(obs)
            action_value = self.net(obs_tensor)
        # select the action with an epsilon-greedy policy
        action = select_actions(action_value, explore_eps)
        # execute the action in the environment
        obs_, reward, done, _ = self.env.step(action)
        obs_ = np.array(obs_)
        # append the sample to the replay buffer
        self.buffer.add(obs, action, reward, obs_, float(done))
        obs = obs_
        # accumulate the reward of the current episode
        episode_reward[-1] += reward
        if done:
            obs = np.array(self.env.reset())
            episode_reward.append(0.0)
        if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
            # sample a batch from the replay buffer and update the network
            batch_samples = self.buffer.sample(self.args.batch_size)
            td_loss = self._update_network(batch_samples)
        if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
            # update the target network
            self.target_net.load_state_dict(self.net.state_dict())
        # mean reward over the last 100 completed episodes
        if len(episode_reward[-101:-1]) == 0:
            mean_reward_per_100 = 0
        else:
            mean_reward_per_100 = np.mean(episode_reward[-101:-1])
        num_episode = len(episode_reward) - 1
        if done and num_episode % self.args.display_interval == 0:
            print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, \
                    num_episode, mean_reward_per_100, td_loss))
            torch.save(self.net.state_dict(), self.model_path + '/model.pt')
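# Neither select_actions nor the exploration schedule is defined in the DQN snippets above.
# A minimal epsilon-greedy sketch consistent with how they are called (the linear decay and
# the exact signatures are assumptions, not the repository's implementation):
import numpy as np
import torch

def select_actions(action_value, explore_eps):
    # with probability explore_eps take a random action, otherwise the greedy one
    if np.random.uniform() <= explore_eps:
        return np.random.randint(action_value.shape[1])
    return torch.argmax(action_value, dim=1).item()

class linear_schedule:
    def __init__(self, total_timesteps, final_ratio, init_ratio=1.0):
        self.total_timesteps = total_timesteps
        self.final_ratio = final_ratio
        self.init_ratio = init_ratio

    def get_value(self, timestep):
        # linearly anneal epsilon from init_ratio towards final_ratio
        frac = min(float(timestep) / self.total_timesteps, 1.0)
        return self.init_ratio + frac * (self.final_ratio - self.init_ratio)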
def learn(self):
    if not self.args.no_sil:
        sil_model = sil_module(self.net, self.args, self.optimizer)
    num_updates = self.args.total_frames // (self.args.num_processes * self.args.nsteps)
    # track the rewards to compute the logging information
    episode_rewards = torch.zeros([self.args.num_processes, 1])
    final_rewards = torch.zeros([self.args.num_processes, 1])
    # start to update
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                _, pi = self.net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # store the transition information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            # step the environments
            obs, rewards, dones, _ = self.envs.step(cpu_actions)
            # keep the raw rewards and clip the training rewards to their sign
            raw_rewards = copy.deepcopy(rewards)
            rewards = np.sign(rewards)
            self.dones = dones
            if not self.args.no_sil:
                sil_model.step(input_tensor.detach().cpu().numpy(), cpu_actions, raw_rewards, dones)
            mb_rewards.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            raw_rewards = torch.from_numpy(np.expand_dims(np.stack(raw_rewards), 1)).float()
            episode_rewards += raw_rewards
            # build the masks and update the per-episode rewards
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in dones])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(input_tensor)
        # compute the discounted returns
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.args.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.args.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        # update the actor-critic network
        vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
        # update the sil module
        if not self.args.no_sil:
            mean_adv, num_samples = sil_model.train_sil_model()
        if update % self.args.log_interval == 0:
            if not self.args.no_sil:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, ' \
                    'Ent: {:.2f}, Min: {}, Max: {}, BR: {}, E: {}, VS: {}, S: {}'.format(\
                    datetime.now(), update, num_updates, (update + 1) * (self.args.num_processes * self.args.nsteps), \
                    final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), sil_model.get_best_reward(), \
                    sil_model.num_episodes(), num_samples, sil_model.num_steps()))
            else:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, ' \
                    'Ent: {:.2f}, Min: {}, Max: {}'.format(\
                    datetime.now(), update, num_updates, (update + 1) * (self.args.num_processes * self.args.nsteps), \
                    final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
            torch.save(self.net.state_dict(), self.model_path + 'model.pt')
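# sil_module is defined elsewhere in the repository. Following Oh et al. (2018),
# "Self-Imitation Learning", its update only imitates stored transitions whose return exceeds
# the current value estimate. A sketch of that per-batch loss (the function name, reduction
# and the w_value coefficient are assumptions):
import torch

def sil_loss_sketch(log_prob, value, returns, w_value=0.01):
    # clipped advantage: only learn from past experience where R > V(s)
    advantage = torch.clamp(returns - value, min=0.0).detach()
    policy_loss = -(log_prob * advantage).mean()
    value_loss = 0.5 * torch.clamp(returns - value, min=0.0).pow(2).mean()
    return policy_loss + w_value * value_loss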
def learn(self):
    num_updates = self.args.total_frames // (self.args.nsteps * self.args.num_workers)
    # track the rewards to compute the logging information
    episode_rewards = torch.zeros([self.args.num_workers, 1])
    final_rewards = torch.zeros([self.args.num_workers, 1])
    reward_hist = []
    policy_loss_hist = []
    env_loss_hist = []
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
        if self.args.lr_decay:
            self._adjust_learning_rate(update, num_updates)
        for step in range(self.args.nsteps):
            with torch.no_grad():
                # get the tensors
                obs_tensor = self._get_tensors(self.obs)
                values, pis = self.net(obs_tensor)
            # select actions
            actions = select_actions(pis)
            # get the actions that will be fed to the environment
            input_actions = actions
            # store the transition information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            mb_values.append(values.detach().cpu().numpy().squeeze())
            # execute the actions in the environment
            obs, rewards, dones, _ = self.envs.step(input_actions)
            # update dones
            self.dones = dones
            mb_rewards.append(rewards)
            # clear the observation of finished workers
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            # process the rewards part -- display the rewards on the screen
            rewards = torch.tensor(np.expand_dims(np.stack(rewards), 1), dtype=torch.float32)
            episode_rewards += rewards
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in dones], dtype=torch.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        # compute the value of the last state
        with torch.no_grad():
            obs_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(obs_tensor)
            last_values = last_values.detach().cpu().numpy().squeeze()
        # compute the advantages with GAE
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.args.nsteps)):
            if t == self.args.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        # after computing the returns, flatten the rollouts
        mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
        mb_actions = mb_actions.swapaxes(0, 1).flatten()
        mb_returns = mb_returns.swapaxes(0, 1).flatten()
        mb_advs = mb_advs.swapaxes(0, 1).flatten()
        # before the update, the old network loads the weights of the current network
        self.old_net.load_state_dict(self.net.state_dict())
        # update the network
        pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs)
        # env_loss, policy_loss = self._update_network_by_env_net(mb_obs, mb_actions, mb_rewards)
        # record the training information
        reward_hist.append(final_rewards.mean().detach().cpu().numpy())
        policy_loss_hist.append(pl)
        # env_loss_hist.append(env_loss)
        if update % self.args.display_interval == 0:
            self.logger.info('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}'
                .format(datetime.now(), update, num_updates, (update + 1) * self.args.nsteps * self.args.num_workers, \
                final_rewards.mean().item(), final_rewards.min().item(), final_rewards.max().item()))
            # save the model
            torch.save(self.net.state_dict(), self.model_path + '/model.pt')
    return reward_hist, env_loss_hist, policy_loss_hist
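# _adjust_learning_rate is called when args.lr_decay is set but is not shown here. A
# straightforward linear-decay sketch (the exact schedule used by the repository is an
# assumption) scales the optimizer's learning rate down to zero over the course of training:
def _adjust_learning_rate(self, update, num_updates):
    # linearly decay the learning rate with the number of completed updates
    lr_frac = 1.0 - (update / num_updates)
    adjusted_lr = self.args.lr * lr_frac
    for param_group in self.optimizer.param_groups:
        param_group['lr'] = adjusted_lr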
def learn(self):
    num_updates = self.args.total_frames // (self.args.nsteps * self.args.num_workers)
    # track the rewards to compute the logging information
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
        if self.args.lr_decay:
            self._adjust_learning_rate(update, num_updates)
        for step in range(self.args.nsteps):
            with torch.no_grad():
                # get the tensors
                obs_tensor = self._get_tensors(self.obs)
                values, pis = self.net(obs_tensor)
            # select actions
            actions = select_actions(pis, self.args.dist, self.args.env_type)
            # get the actions that will be fed to the environment
            if self.args.env_type == 'atari':
                input_actions = actions
            else:
                if self.args.dist == 'gauss':
                    input_actions = actions.copy()
                elif self.args.dist == 'beta':
                    # rescale beta samples from [0, 1] to the [-1, 1] action range
                    input_actions = -1 + 2 * actions
            # store the transition information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            mb_values.append(values.detach().cpu().numpy().squeeze())
            # execute the actions in the environment
            obs, rewards, dones, _ = self.envs.step(input_actions)
            # update dones
            if self.args.env_type == 'mujoco':
                dones = np.array([dones])
                rewards = np.array([rewards])
            self.dones = dones
            mb_rewards.append(rewards)
            # clear the observation of finished workers
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
                    if self.args.env_type == 'mujoco':
                        # reset the environment
                        obs = self.envs.reset()
            self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(self.running_state(obs), 0)
            # process the rewards part -- display the rewards on the screen
            episode_rewards += rewards
            masks = np.array([0.0 if done_ else 1.0 for done_ in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        if self.args.env_type == 'mujoco':
            mb_values = np.expand_dims(mb_values, 1)
        # compute the value of the last state
        with torch.no_grad():
            obs_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(obs_tensor)
            last_values = last_values.detach().cpu().numpy().squeeze()
        # compute the advantages with GAE
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.args.nsteps)):
            if t == self.args.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_values
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        # after computing the returns, flatten the rollouts
        mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
        if self.args.env_type == 'atari':
            mb_actions = mb_actions.swapaxes(0, 1).flatten()
        mb_returns = mb_returns.swapaxes(0, 1).flatten()
        mb_advs = mb_advs.swapaxes(0, 1).flatten()
        # before the update, the old network loads the weights of the current network
        self.old_net.load_state_dict(self.net.state_dict())
        # update the network
        pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs)
        # display the training information
        if update % self.args.display_interval == 0:
            print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, PL: {:.3f}, ' \
                'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1) * self.args.nsteps * self.args.num_workers, \
                final_rewards.mean(), final_rewards.min(), final_rewards.max(), pl, vl, ent))
            # save the model
            if self.args.env_type == 'atari':
                torch.save(self.net.state_dict(), self.model_path + '/model.pt')
            else:
                # for mujoco, we also need to keep the running mean filter!
                torch.save([self.net.state_dict(), self.running_state], self.model_path + '/model.pt')
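# _update_network is not shown in the PPO snippets above. It presumably optimizes the standard
# clipped surrogate objective; the simplified single-pass sketch below assumes a discrete
# action space (pis are categorical logits) and hyperparameters named clip, vloss_coef,
# ent_coef and max_grad_norm. The real implementation likely also iterates over epochs and
# minibatches, so treat this only as an illustration of the loss.
def _update_network_sketch(self, mb_obs, mb_actions, mb_returns, mb_advs):
    obs_tensor = torch.tensor(mb_obs, dtype=torch.float32)
    actions_tensor = torch.tensor(mb_actions, dtype=torch.long)
    returns_tensor = torch.tensor(mb_returns, dtype=torch.float32).unsqueeze(1)
    advs_tensor = torch.tensor(mb_advs, dtype=torch.float32).unsqueeze(1)
    # evaluate the current and the old policy on the collected batch
    values, pis = self.net(obs_tensor)
    with torch.no_grad():
        _, old_pis = self.old_net(obs_tensor)
    dist = torch.distributions.Categorical(logits=pis)
    old_dist = torch.distributions.Categorical(logits=old_pis)
    log_prob = dist.log_prob(actions_tensor).unsqueeze(1)
    old_log_prob = old_dist.log_prob(actions_tensor).unsqueeze(1)
    # clipped surrogate objective
    ratio = torch.exp(log_prob - old_log_prob)
    surr1 = ratio * advs_tensor
    surr2 = torch.clamp(ratio, 1.0 - self.args.clip, 1.0 + self.args.clip) * advs_tensor
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = (returns_tensor - values).pow(2).mean()
    entropy = dist.entropy().mean()
    total_loss = policy_loss + self.args.vloss_coef * value_loss - self.args.ent_coef * entropy
    self.optimizer.zero_grad()
    total_loss.backward()
    torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.args.max_grad_norm)
    self.optimizer.step()
    return policy_loss.item(), value_loss.item(), entropy.item()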
def learn(self):
    # configuration
    USER_SAVE_DATE = '3006'
    USER_SAVE_MODEL = 'mymodel.pt'
    CONTINUE_TRAINING = False  # False for new training, True for improving the existing model
    num_of_iteration = 0
    # paths
    date = USER_SAVE_DATE
    plot_path = self.model_path + '/' + date + '/plots/plot_'
    best_model_path = self.model_path + '/' + date + '/best/'
    all_model_path = self.model_path + '/' + date
    reward_path = self.model_path + '/' + date + '/rewards/'
    load_model = CONTINUE_TRAINING
    best_model = all_model_path + '/' + USER_SAVE_MODEL
    all_final_rewards = []
    num_updates = 1000000
    obs = self.running_state(self.env.reset())
    final_reward = 0
    episode_reward = 0
    self.dones = False
    # load the best model to continue training
    if load_model:
        print("=> Loading checkpoint...")
        checkpoint = torch.load(best_model)
        self.start_episode = checkpoint['update']
        self.net.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.running_state = checkpoint['running_state']
        final_reward = checkpoint['reward']
        all_final_rewards.append(final_reward)
        # print("=> loaded checkpoint (Episode: {}, reward: {})".format(checkpoint['update'], final_reward))
    for update in range(self.start_episode, num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                obs_tensor = self._get_tensors(obs)
                value, pi = self.net(obs_tensor)
            # select actions
            actions = select_actions(pi)
            # store the transition information
            mb_obs.append(np.copy(obs))
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            mb_values.append(value.detach().numpy().squeeze())
            # execute the actions in the environment
            obs_, reward, done, _ = self.env.step(actions)
            self.dones = done
            mb_rewards.append(reward)
            if done:
                obs_ = self.env.reset()
            obs = self.running_state(obs_)
            episode_reward += reward
            mask = 0.0 if done else 1.0
            final_reward *= mask
            final_reward += (1 - mask) * episode_reward
            episode_reward *= mask
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        # compute the value of the last state
        with torch.no_grad():
            obs_tensor = self._get_tensors(obs)
            last_value, _ = self.net(obs_tensor)
            last_value = last_value.detach().numpy().squeeze()
        # compute the advantages with GAE
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.args.nsteps)):
            if t == self.args.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_value
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        # normalize the advantages
        mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5)
        # before the update, load the current parameters into the old network
        self.old_net.load_state_dict(self.net.state_dict())
        # update the network
        policy_loss, value_loss = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs)
        # torch.save([self.net.state_dict(), self.running_state], self.model_path + 'model.pt')
        print('Episode: {} / {}, Iteration: {}, Reward: {:.3f}'.format(
            update, num_updates, (update + 1) * self.args.nsteps, final_reward))
        all_final_rewards.append(final_reward.item())
        self.save_model_for_training(update, final_reward.item(),
            filepath=best_model_path + str(round(final_reward.item(), 2)) + '_' + str(update) + '.pt')
        torch.save([self.net.state_dict(), self.running_state],
            self.model_path + "/" + date + "/" + str(round(final_reward.item(), 2)) + str(update) + '_testing' + ".pt")
        if update % self.args.display_interval == 0:
            # plot the reward curve and dump the rewards to disk
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(all_final_rewards)), all_final_rewards)
            plt.ylabel('Reward')
            plt.xlabel('Episode #')
            plt.savefig(plot_path + str(update) + '.png')
            plt.plot()
            reward_df = pd.DataFrame(all_final_rewards)
            with open(reward_path + 'rewards.csv', 'a') as f:
                reward_df.to_csv(f, header=False)
def learn(self):
    log_data = {}
    num_updates = self.args.total_frames // (self.args.nsteps * self.args.num_workers)
    # track the rewards to compute the logging information
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    delay_step = 0
    delay_rewards = 0
    for update in range(num_updates):
        mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_values_mix, mb_values_ex, mb_obs_ = [], [], [], [], [], [], []
        if self.args.lr_decay:
            self._adjust_learning_rate(update, num_updates)
        for step in range(self.args.nsteps):
            with torch.no_grad():
                # get the tensors
                obs_tensor = self._get_tensors(self.obs)
                v_mix, pis = self.net(obs_tensor)
            # select actions
            actions = select_actions(pis, self.args.dist, self.args.env_type)
            actions_tensor = torch.tensor(actions, dtype=torch.float32,
                                          device='cuda' if self.args.cuda else 'cpu').unsqueeze(0)
            # predict the extrinsic value with the intrinsic-reward network
            _, v_ex = self.intrinsic_net(obs_tensor)
            if self.args.env_type == 'atari':
                input_actions = actions
            else:
                if self.args.dist == 'gauss':
                    input_actions = actions.copy()
                elif self.args.dist == 'beta':
                    input_actions = -1 + 2 * actions
            # store the transition information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            mb_values_mix.append(v_mix.detach().cpu().numpy().squeeze())
            mb_values_ex.append(v_ex.detach().cpu().numpy().squeeze())
            # execute the actions in the environment
            obs_, rewards, dones, _ = self.envs.step(input_actions)
            obs_ = np.expand_dims(self.running_state(obs_), 0)
            mb_obs_.append(np.copy(obs_))
            # accumulate the delayed reward and only release it at episode end or every reward_delay_freq steps
            delay_step += 1
            delay_rewards += rewards
            if dones or delay_step == self.args.reward_delay_freq:
                rewards = delay_rewards
                delay_step, delay_rewards = 0, 0
            else:
                rewards = 0
            # update dones
            if self.args.env_type == 'mujoco':
                dones = np.array([dones])
                rewards = np.array([rewards])
            self.dones = dones
            mb_rewards_ex.append(rewards)
            # clear the observation of finished workers
            self.obs = obs_
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
                    if self.args.env_type == 'mujoco':
                        # reset the environment
                        obs_ = self.envs.reset()
                        self.obs = np.expand_dims(self.running_state(obs_), 0)
            # self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(self.running_state(obs), 0)
            # process the rewards part -- display the rewards on the screen
            episode_rewards += rewards
            masks = np.array([0.0 if done_ else 1.0 for done_ in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # process the rollouts
        mb_obs_ = np.asarray(mb_obs_, dtype=np.float32)
        mb_obs_ = mb_obs_.swapaxes(0, 1).reshape(self.batch_ob_shape)
        # process the current observations
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
        mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_values_mix = np.asarray(mb_values_mix, dtype=np.float32)
        mb_values_ex = np.asarray(mb_values_ex, dtype=np.float32)
        # calculate the intrinsic rewards r_in
        mb_rewards_in = self._computer_intrinsic_rewards(mb_obs, mb_obs_)
        if self.args.env_type == 'mujoco':
            mb_values_mix = np.expand_dims(mb_values_mix, 1)
            mb_values_ex = np.expand_dims(mb_values_ex, 1)
        # compute the value of the last state
        with torch.no_grad():
            obs_tensor = self._get_tensors(self.obs)
            last_values_mix, _ = self.net(obs_tensor)
            last_values_mix = last_values_mix.detach().cpu().numpy().squeeze()
            # compute the last extrinsic values
            _, last_values_ex = self.intrinsic_net(obs_tensor)
            last_values_ex = last_values_ex.detach().cpu().numpy().squeeze()
        # compute the next-state mixed values, which are used in the training
        mb_values_mix_next = np.zeros_like(mb_values_mix)
        mb_values_mix_next[:-1] = mb_values_mix[1:] * (1.0 - mb_dones[1:])
        mb_values_mix_next[-1] = last_values_mix * (1 - self.dones)
        # get the td error of the mixed value
        td_mix = self.args.gamma * mb_values_mix_next - mb_values_mix
        # start to compute the advantages...
        mb_advs_mix = np.zeros_like(mb_rewards_ex)
        mb_advs_ex = np.zeros_like(mb_rewards_ex)
        # calculate the mixed reward
        mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
        lastgaelam_mix, lastgaelam_ex = 0, 0
        for t in reversed(range(self.args.nsteps)):
            if t == self.args.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues_mix = last_values_mix
                nextvalues_ex = last_values_ex
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues_mix = mb_values_mix[t + 1]
                nextvalues_ex = mb_values_ex[t + 1]
            delta_mix = mb_rewards_mix[t] + self.args.gamma * nextvalues_mix * nextnonterminal - mb_values_mix[t]
            delta_ex = mb_rewards_ex[t] + self.args.gamma * nextvalues_ex * nextnonterminal - mb_values_ex[t]
            mb_advs_mix[t] = lastgaelam_mix = delta_mix + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam_mix
            mb_advs_ex[t] = lastgaelam_ex = delta_ex + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam_ex
        # mb_returns = mb_advs + mb_values
        mb_returns_mix = mb_advs_mix + mb_values_mix
        mb_returns_ex = mb_advs_ex + mb_values_ex
        # after computing the returns, flatten the rollouts
        if self.args.env_type == 'atari':
            mb_actions = mb_actions.swapaxes(0, 1).flatten()
        mb_returns_mix = mb_returns_mix.swapaxes(0, 1).flatten()
        mb_returns_ex = mb_returns_ex.swapaxes(0, 1).flatten()
        mb_advs_mix = mb_advs_mix.swapaxes(0, 1).flatten()
        mb_advs_ex = mb_advs_ex.swapaxes(0, 1).flatten()
        # flatten the rewards
        mb_rewards_ex = mb_rewards_ex.swapaxes(0, 1).flatten()
        mb_rewards_in = mb_rewards_in.swapaxes(0, 1).flatten()
        td_mix = td_mix.swapaxes(0, 1).flatten()
        mb_dones = mb_dones.swapaxes(0, 1).flatten()
        mb_values_mix = mb_values_mix.swapaxes(0, 1).flatten()
        # before the update, the old network loads the weights of the current network
        self.old_net.load_state_dict(self.net.state_dict())
        # update the network
        pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns_mix, mb_returns_ex, mb_advs_mix, mb_advs_ex, \
                mb_rewards_in, mb_rewards_ex, td_mix, mb_dones, mb_values_mix, mb_obs_)
        # display the training information
        if update % self.args.display_interval == 0:
            print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, R_in: {:.3f}, R_ex: {:.3f}, PL: {:.3f}, ' \
                'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1) * self.args.nsteps * self.args.num_workers, \
                final_rewards.mean(), final_rewards.min(), final_rewards.max(), np.mean(mb_rewards_in), np.mean(mb_rewards_ex), pl, vl, ent))
            # save the model
            if self.args.env_type == 'atari':
                torch.save(self.net.state_dict(), self.model_path + '/model.pt')
            else:
                # for mujoco, we also need to keep the running mean filter!
                torch.save([self.net.state_dict(), self.running_state, self.intrinsic_net.state_dict()],
                           self.model_path + '/model.pt')
            # save the log data
            log_data[update] = {'frames': (update + 1) * self.args.nsteps * self.args.num_workers,
                                'rewards_mean': final_rewards.mean(),
                                'rewards_in': np.mean(mb_rewards_in),
                                'rewards_ex': np.mean(mb_rewards_ex)}
            torch.save(log_data, '{}/{}.pt'.format(self.intrinsic_data_path, self.args.env_name))
import cv2
import numpy as np
import torch
# get_args, Net, make_atari_env, VecFrameStack and select_actions are provided by the
# repository's own modules and are assumed to be imported from there.

# convert the stacked observation into an (N, C, H, W) float tensor
def get_tensors(obs):
    input_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)), dtype=torch.float32)
    return input_tensor

if __name__ == "__main__":
    args = get_args()
    # create the environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # get the model path and load the trained weights
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = Net(env.action_space.n)
    network.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
    obs = env.reset()
    while True:
        env.render()
        # get the action from the policy
        with torch.no_grad():
            input_tensor = get_tensors(obs)
            _, pi = network(input_tensor)
        actions = select_actions(pi, True)
        obs, reward, done, _ = env.step([actions])
    env.close()
def learn(self):
    num_updates = self.args.total_timesteps // self.args.nsteps
    obs = self.running_state(self.env.reset())
    final_reward = 0
    episode_reward = 0
    self.dones = False
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                obs_tensor = self._get_tensors(obs)
                value, pi = self.net(obs_tensor)
            # select actions
            actions = select_actions(pi)
            # store the transition information
            mb_obs.append(np.copy(obs))
            mb_actions.append(actions)
            mb_dones.append(self.dones)
            mb_values.append(value.detach().numpy().squeeze())
            # execute the actions in the environment
            obs_, reward, done, _ = self.env.step(actions)
            self.dones = done
            mb_rewards.append(reward)
            if done:
                obs_ = self.env.reset()
            obs = self.running_state(obs_)
            episode_reward += reward
            mask = 0.0 if done else 1.0
            final_reward *= mask
            final_reward += (1 - mask) * episode_reward
            episode_reward *= mask
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        # compute the value of the last state
        with torch.no_grad():
            obs_tensor = self._get_tensors(obs)
            last_value, _ = self.net(obs_tensor)
            last_value = last_value.detach().numpy().squeeze()
        # compute the advantages with GAE
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        lastgaelam = 0
        for t in reversed(range(self.args.nsteps)):
            if t == self.args.nsteps - 1:
                nextnonterminal = 1.0 - self.dones
                nextvalues = last_value
            else:
                nextnonterminal = 1.0 - mb_dones[t + 1]
                nextvalues = mb_values[t + 1]
            delta = mb_rewards[t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[t]
            mb_advs[t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
        mb_returns = mb_advs + mb_values
        # normalize the advantages
        mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5)
        # before the update, load the current parameters into the old network
        self.old_net.load_state_dict(self.net.state_dict())
        # update the network
        policy_loss, value_loss = self._update_network(mb_obs, mb_actions, mb_returns, mb_advs)
        torch.save([self.net.state_dict(), self.running_state], self.model_path + 'model.pt')
        print('[{}] Update: {} / {}, Frames: {}, Reward: {:.3f}, VL: {:.3f}, PL: {:.3f}'.format(datetime.now(), update, \
            num_updates, (update + 1) * self.args.nsteps, final_reward, value_loss, policy_loss))
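# running_state above is an observation normaliser that is saved together with the policy so
# the same statistics can be reused at evaluation time. It is not defined in this snippet;
# the sketch below is a common running mean/std filter of that kind (the class name, clipping
# value and update rule are assumptions, not necessarily the repository's implementation).
import numpy as np

class RunningState:
    def __init__(self, shape, clip=5.0):
        self.clip = clip
        self.n = 0
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        # incrementally update the running mean and variance
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.var += (delta * (x - self.mean) - self.var) / self.n
        std = np.sqrt(self.var) + 1e-8
        # normalise and clip the observation
        return np.clip((x - self.mean) / std, -self.clip, self.clip)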
def learn(self):
    num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
    # track the rewards to compute the logging information
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    # start to update
    for update in range(num_updates):
        if self.args.lr_decay:
            self._adjust_learning_rate(update, num_updates)
        mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_obs_, mb_v_ex, mb_v_mix = [], [], [], [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                v_mix, pi = self.net(input_tensor)
                _, v_ex = self.intrinsic_net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # store the transition information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            mb_v_ex.append(v_ex.detach().cpu().numpy().squeeze())
            mb_v_mix.append(v_mix.detach().cpu().numpy().squeeze())
            # step the environments
            obs_, rewards, dones, _ = self.envs.step(cpu_actions)
            # store the next observation
            mb_obs_.append(np.copy(obs_))
            # store the rewards
            self.dones = dones
            mb_rewards_ex.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs_
            episode_rewards += rewards
            # build the masks and update the per-episode rewards
            masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        """
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0)
        """
        # calculate the intrinsic rewards and make sure their dimensions are right
        mb_rewards_in = self._compute_intrinsic_rewards(mb_obs, mb_obs_)
        mb_rewards_in = mb_rewards_in.reshape((self.args.num_workers, self.args.nsteps))
        # process the extrinsic rewards
        mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32).swapaxes(1, 0)
        # calculate the mixed reward
        mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
        mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
        # masks
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        # calculate the last mixed values
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values_mix, _ = self.net(input_tensor)
            last_values_mix = last_values_mix.detach().cpu().numpy().squeeze()
            # then the last extrinsic values
            _, last_values_ex = self.intrinsic_net(input_tensor)
            last_values_ex = last_values_ex.detach().cpu().numpy().squeeze()
        # get the extrinsic and mixed returns
        mb_returns_ex, mb_returns_mix = np.zeros(mb_rewards_ex.shape), np.zeros(mb_rewards_in.shape)
        # compute the returns
        for n, (rewards_ex, rewards_mix, dones, value_mix, value_ex) in enumerate(zip(mb_rewards_ex, mb_rewards_mix, mb_dones, last_values_mix, last_values_ex)):
            rewards_ex = rewards_ex.tolist()
            rewards_mix = rewards_mix.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                returns_ex = discount_with_dones(rewards_ex + [value_ex], dones + [0], self.args.gamma)[:-1]
                returns_mix = discount_with_dones(rewards_mix + [value_mix], dones + [0], self.args.gamma)[:-1]
            else:
                returns_ex = discount_with_dones(rewards_ex, dones, self.args.gamma)
                returns_mix = discount_with_dones(rewards_mix, dones, self.args.gamma)
            mb_returns_ex[n] = returns_ex
            mb_returns_mix[n] = returns_mix
        # flatten everything
        mb_rewards_ex = mb_rewards_ex.flatten()
        mb_rewards_in = mb_rewards_in.flatten()
        mb_returns_ex = mb_returns_ex.flatten()
        mb_returns_mix = mb_returns_mix.flatten()
        mb_actions = mb_actions.flatten()
        mb_v_ex = mb_v_ex.flatten()
        mb_v_mix = mb_v_mix.flatten()
        mb_dones = mb_dones.flatten()
        mb_masks = mb_masks.flatten()
        # before the training, calculate the coefficient matrix of discount factors
        dis_v_mix_last = np.zeros([mb_obs.shape[0]], np.float32)
        coef_mat = np.zeros([mb_obs.shape[0], mb_obs.shape[0]], np.float32)
        for i in range(mb_obs.shape[0]):
            dis_v_mix_last[i] = self.args.gamma ** (self.args.nsteps - i % self.args.nsteps) * last_values_mix[i // self.args.nsteps]
            coef = 1.0
            for j in range(i, mb_obs.shape[0]):
                if j > i and j % self.args.nsteps == 0:
                    break
                coef_mat[i][j] = coef
                coef *= self.args.gamma
                if mb_dones[j]:
                    dis_v_mix_last[i] = 0
                    break
        # update the network
        vl, al, ent = self._update_network(mb_obs, mb_obs_, mb_masks, mb_actions, mb_rewards_ex, mb_returns_ex, mb_v_ex, mb_v_mix, \
                dis_v_mix_last, coef_mat)
        if update % self.args.log_interval == 0:
            print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max: {}, R_in: {:.3f}'.format(\
                datetime.now(), update, num_updates, (update + 1) * (self.args.num_workers * self.args.nsteps), \
                final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), np.mean(mb_rewards_in)))
            torch.save(self.net.state_dict(), self.model_path + 'model.pt')
def learn(self):
    num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
    # get the reward to calculate other information
    episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
    # start to update
    for update in range(num_updates):
        mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
        for step in range(self.args.nsteps):
            with torch.no_grad():
                input_tensor = self._get_tensors(self.obs)
                _, pi = self.net(input_tensor)
            # select actions
            actions = select_actions(pi)
            cpu_actions = actions.squeeze(1).cpu().numpy()
            # start to store the information
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(cpu_actions)
            mb_dones.append(self.dones)
            # step
            obs, rewards, dones, _ = self.envs.step(cpu_actions)
            # start to store the rewards
            self.dones = dones
            mb_rewards.append(rewards)
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            self.obs = obs
            episode_rewards += rewards
            # get the masks
            masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
        # update the obs
        mb_dones.append(self.dones)
        # process the rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]
        # calculate the last value
        with torch.no_grad():
            input_tensor = self._get_tensors(self.obs)
            last_values, _ = self.net(input_tensor)
        # compute returns
        for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values.detach().cpu().numpy().squeeze())):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                rewards = discount_with_dones(rewards + [value], dones + [0], self.args.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.args.gamma)
            mb_rewards[n] = rewards
        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        # start to update network
        vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
        if update % self.args.log_interval == 0:
            print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max: {}'.format(\
                datetime.now(), update, num_updates, (update + 1) * (self.args.num_workers * self.args.nsteps), \
                final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
            torch.save(self.net.state_dict(), self.model_path + 'model.pt')
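# discount_with_dones is used by the A2C-style snippets above to bootstrap n-step returns but
# is not defined here. The standard implementation of this helper (as popularised by OpenAI
# baselines) walks the rollout backwards and zeroes the bootstrap at episode boundaries:
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        # reset the running return whenever an episode ended at this step
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]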