Example #1
0
 def learn(self):
     episode_reward = reward_recorder()
     state = np.array(self.env.reset())
     td_loss = 0
     for timestep in range(self.args.total_timesteps):
         explore_eps = self.exploration_schedule.get_value(timestep)
         with torch.no_grad():
             state_tensor = self._get_tensors(state)
             action_value = self.net(state_tensor)
         # select action
         action = select_actions(action_value, explore_eps)
         next_state, reward, done, _ = self.env.step(action)
         next_state = np.array(next_state)
         # append samples
         self.buffer.add(state, action, reward, next_state, float(done))
         state = next_state
         # add the reward
         episode_reward.add_rewards(reward)
         if done:
             state = np.array(self.env.reset())
             episode_reward.start_new_episode()
         # sample a batch from the replay buffer
         if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
             batch_samples = self.buffer.sample(self.args.batch_size)
             td_loss = self._update_network(batch_samples)
         if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
             self.target_net.load_state_dict(self.net.state_dict())
         if done and episode_reward.num_episodes % self.args.display_interval == 0:
             print("[{}] Frames: {}, Episode: {}, Mean{:.3f}, Loss {:.3f}".format(datetime.now(), timestep, episode_reward.num_episodes,\
                                                                                  episode_reward.mean, td_loss))
             torch.save(self.net.state_dict(),
                        self.model_path + '/model.pt')
 def learn(self):
     episode_reward = [0.0]
     obs = np.array(self.env.reset())
     td_loss = 0
     for timestep in range(self.args.total_timesteps):
         explore_eps = self.exploration_schedule.get_value(timestep)
         with torch.no_grad():
             obs_tensor = self._get_tensors(obs)
             action_value = self.net(obs_tensor)
         # select actions
         action = select_actions(action_value, explore_eps)
         # execute the actions
         obs_, reward, done, _ = self.env.step(action)
         obs_ = np.array(obs_)
         # append the samples to the replay buffer
         self.buffer.add(obs, action, reward, obs_, float(done))
         obs = obs_
         # add the rewards
         episode_reward[-1] += reward
         if done:
             obs = np.array(self.env.reset())
             episode_reward.append(0.0)
         if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
             # sample a batch from the replay buffer
             batch_samples = self.buffer.sample(self.args.batch_size)
             td_loss = self._update_network(batch_samples)
         if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
             # update the target network
             self.target_net.load_state_dict(self.net.state_dict())
         if len(episode_reward[-101:-1]) == 0:
             mean_reward_per_100 = 0
         else:
             mean_reward_per_100 = np.mean(episode_reward[-101:-1])
         num_episode = len(episode_reward) - 1
         if done and num_episode % self.args.display_interval == 0:
             print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, num_episode, \
                 mean_reward_per_100, td_loss))
             torch.save(self.net.state_dict(),
                        self.model_path + '/model.pt')
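
Both DQN variants push transitions into self.buffer and later draw mini-batches from it. A minimal uniform replay buffer with the same add/sample interface is sketched below; the buffer actually used by the training code may differ (for example, it could be prioritized).

import random

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []
        self.next_idx = 0

    def add(self, obs, action, reward, obs_next, done):
        data = (obs, action, reward, obs_next, done)
        if self.next_idx >= len(self.storage):
            self.storage.append(data)
        else:
            # overwrite the oldest entry once the buffer is full
            self.storage[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.capacity

    def sample(self, batch_size):
        # uniform sampling with replacement; stack each field into an array
        idxes = [random.randint(0, len(self.storage) - 1) for _ in range(batch_size)]
        obs, actions, rewards, obs_next, dones = zip(*[self.storage[i] for i in idxes])
        return (np.array(obs), np.array(actions), np.array(rewards),
                np.array(obs_next), np.array(dones))
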
 def learn(self):
     if not self.args.no_sil:
         sil_model = sil_module(self.net, self.args, self.optimizer)
     num_updates = self.args.total_frames // (self.args.num_processes *
                                              self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = torch.zeros([self.args.num_processes, 1])
     final_rewards = torch.zeros([self.args.num_processes, 1])
     # start to update
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 _, pi = self.net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             # step
             obs, rewards, dones, _ = self.envs.step(cpu_actions)
             # process rewards...
             raw_rewards = copy.deepcopy(rewards)
             rewards = np.sign(rewards)
             # start to store the rewards
             self.dones = dones
             if not self.args.no_sil:
                 sil_model.step(input_tensor.detach().cpu().numpy(),
                                cpu_actions, raw_rewards, dones)
             mb_rewards.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
             self.obs = obs
             raw_rewards = torch.from_numpy(
                 np.expand_dims(np.stack(raw_rewards), 1)).float()
             episode_rewards += raw_rewards
             # get the masks
             masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                        for done_ in dones])
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
             self.batch_ob_shape)
         mb_rewards = np.asarray(mb_rewards,
                                 dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(input_tensor)
         # compute returns
         for n, (rewards, dones, value) in enumerate(
                 zip(mb_rewards, mb_dones,
                     last_values.detach().cpu().numpy().squeeze())):
             rewards = rewards.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 rewards = discount_with_dones(rewards + [value],
                                               dones + [0],
                                               self.args.gamma)[:-1]
             else:
                 rewards = discount_with_dones(rewards, dones,
                                               self.args.gamma)
             mb_rewards[n] = rewards
         mb_rewards = mb_rewards.flatten()
         mb_actions = mb_actions.flatten()
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
         # start to update the sil_module
         if not self.args.no_sil:
             mean_adv, num_samples = sil_model.train_sil_model()
         if update % self.args.log_interval == 0:
             if not self.args.no_sil:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, ' \
                        'Ent: {:.2f}, Min: {}, Max: {}, BR: {}, E: {}, VS: {}, S: {}'.format(\
                         datetime.now(), update, num_updates, (update+1)*(self.args.num_processes * self.args.nsteps),\
                         final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), sil_model.get_best_reward(), \
                         sil_model.num_episodes(), num_samples, sil_model.num_steps()))
             else:
                print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.2f}, VL: {:.3f}, PL: {:.3f}, ' \
                        'Ent: {:.2f}, Min: {}, Max: {}'.format(\
                         datetime.now(), update, num_updates, (update+1)*(self.args.num_processes * self.args.nsteps),\
                         final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')
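
The return computation in the A2C example above relies on discount_with_dones, the n-step return helper from OpenAI baselines' A2C: it walks the rewards backwards and resets the running return at episode boundaries. When the rollout does not end in a terminal state, the caller appends the critic's bootstrap value, discounts, and drops the extra entry ([:-1]), exactly as done above. A sketch matching that behaviour:

def discount_with_dones(rewards, dones, gamma):
    # backward recursion: R_t = r_t + gamma * R_{t+1} * (1 - done_t)
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]
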
 def learn(self):
     num_updates = self.args.total_frames // (self.args.nsteps *
                                              self.args.num_workers)
     # get the reward to calculate other information
     episode_rewards = torch.zeros([self.args.num_workers, 1])
     final_rewards = torch.zeros([self.args.num_workers, 1])
     reward_hist = []
     policy_loss_hist = []
     env_loss_hist = []
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
         if self.args.lr_decay:
             self._adjust_learning_rate(update, num_updates)
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 # get tensors
                 obs_tensor = self._get_tensors(self.obs)
                 values, pis = self.net(obs_tensor)
             # select actions
             actions = select_actions(pis)
             # get the input actions
             input_actions = actions
             # start to store information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(actions)
             mb_dones.append(self.dones)
             mb_values.append(values.detach().cpu().numpy().squeeze())
             # start to execute the actions in the environment
             obs, rewards, dones, _ = self.envs.step(input_actions)
             # update dones
             self.dones = dones
             mb_rewards.append(rewards)
             # clear the observation
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
             self.obs = obs
             # process the rewards part -- display the rewards on the screen
             rewards = torch.tensor(np.expand_dims(np.stack(rewards), 1),
                                    dtype=torch.float32)
             episode_rewards += rewards
             masks = torch.tensor([[0.0] if done_ else [1.0]
                                   for done_ in dones],
                                  dtype=torch.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.float32)
         mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
         mb_actions = np.asarray(mb_actions, dtype=np.float32)
         mb_dones = np.asarray(mb_dones, dtype=bool)
         mb_values = np.asarray(mb_values, dtype=np.float32)
         # compute the last state value
         with torch.no_grad():
             obs_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(obs_tensor)
             last_values = last_values.detach().cpu().numpy().squeeze()
         # start to compute advantages...
         mb_returns = np.zeros_like(mb_rewards)
         mb_advs = np.zeros_like(mb_rewards)
         lastgaelam = 0
         for t in reversed(range(self.args.nsteps)):
             if t == self.args.nsteps - 1:
                 nextnonterminal = 1.0 - self.dones
                 nextvalues = last_values
             else:
                 nextnonterminal = 1.0 - mb_dones[t + 1]
                 nextvalues = mb_values[t + 1]
             delta = mb_rewards[
                 t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[
                     t]
             mb_advs[
                 t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
         mb_returns = mb_advs + mb_values
         # after computing the returns, process the rollouts
         mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
         mb_actions = mb_actions.swapaxes(0, 1).flatten()
         mb_returns = mb_returns.swapaxes(0, 1).flatten()
         mb_advs = mb_advs.swapaxes(0, 1).flatten()
         # before updating the network, load the current weights into the old network
         self.old_net.load_state_dict(self.net.state_dict())
         # start to update the network
         pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns,
                                            mb_advs)
         # env_loss, policy_loss = self._update_network_by_env_net(mb_obs, mb_actions, mb_rewards)
         # display the training information
         reward_hist.append(final_rewards.mean().detach().cpu().numpy())
         policy_loss_hist.append(pl)
         # env_loss_hist.append(env_loss)
         if update % self.args.display_interval == 0:
             self.logger.info('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}'
                              .format(datetime.now(), update, num_updates, (update + 1)*self.args.nsteps*self.args.num_workers, \
                             final_rewards.mean().item(), final_rewards.min().item(), final_rewards.max().item()))
             # save the model
             torch.save(self.net.state_dict(),
                        self.model_path + '/model.pt')
     return reward_hist, env_loss_hist, policy_loss_hist
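
The advantage loop in this learn method is an inline implementation of Generalized Advantage Estimation (GAE), with self.args.tau playing the role usually called lambda. For reference, the same computation written as a standalone function over arrays of shape (nsteps, num_workers) is shown below; compute_gae is a name introduced here, not one used by the code above.

import numpy as np


def compute_gae(rewards, values, dones, last_values, last_dones, gamma, tau):
    # rewards, values, dones: arrays of shape (nsteps, num_workers)
    # last_values, last_dones: bootstrap value and done flags for the step after the rollout
    nsteps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        if t == nsteps - 1:
            nextnonterminal = 1.0 - last_dones
            nextvalues = last_values
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            nextvalues = values[t + 1]
        delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * tau * nextnonterminal * lastgaelam
    returns = advs + values
    return advs, returns
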
Example #5
0
 def learn(self):
     num_updates = self.args.total_frames // (self.args.nsteps *
                                              self.args.num_workers)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
         if self.args.lr_decay:
             self._adjust_learning_rate(update, num_updates)
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 # get tensors
                 obs_tensor = self._get_tensors(self.obs)
                 values, pis = self.net(obs_tensor)
             # select actions
             actions = select_actions(pis, self.args.dist,
                                      self.args.env_type)
             if self.args.env_type == 'atari':
                 input_actions = actions
             else:
                 if self.args.dist == 'gauss':
                     input_actions = actions.copy()
                 elif self.args.dist == 'beta':
                     input_actions = -1 + 2 * actions
             # start to store information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(actions)
             mb_dones.append(self.dones)
             mb_values.append(values.detach().cpu().numpy().squeeze())
             # start to execute the actions in the environment
             obs, rewards, dones, _ = self.envs.step(input_actions)
             # update dones
             if self.args.env_type == 'mujoco':
                 dones = np.array([dones])
                 rewards = np.array([rewards])
             self.dones = dones
             mb_rewards.append(rewards)
             # clear the observation
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
                     if self.args.env_type == 'mujoco':
                         # reset the environment
                         obs = self.envs.reset()
             self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(
                 self.running_state(obs), 0)
             # process the rewards part -- display the rewards on the screen
             episode_rewards += rewards
             masks = np.array([0.0 if done_ else 1.0 for done_ in dones],
                              dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.float32)
         mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
         mb_actions = np.asarray(mb_actions, dtype=np.float32)
         mb_dones = np.asarray(mb_dones, dtype=bool)
         mb_values = np.asarray(mb_values, dtype=np.float32)
         if self.args.env_type == 'mujoco':
             mb_values = np.expand_dims(mb_values, 1)
         # compute the last state value
         with torch.no_grad():
             obs_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(obs_tensor)
             last_values = last_values.detach().cpu().numpy().squeeze()
         # start to compute advantages...
         mb_returns = np.zeros_like(mb_rewards)
         mb_advs = np.zeros_like(mb_rewards)
         lastgaelam = 0
         for t in reversed(range(self.args.nsteps)):
             if t == self.args.nsteps - 1:
                 nextnonterminal = 1.0 - self.dones
                 nextvalues = last_values
             else:
                 nextnonterminal = 1.0 - mb_dones[t + 1]
                 nextvalues = mb_values[t + 1]
             delta = mb_rewards[
                 t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[
                     t]
             mb_advs[
                 t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
         mb_returns = mb_advs + mb_values
         # after computing the returns, process the rollouts
         mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
         if self.args.env_type == 'atari':
             mb_actions = mb_actions.swapaxes(0, 1).flatten()
         mb_returns = mb_returns.swapaxes(0, 1).flatten()
         mb_advs = mb_advs.swapaxes(0, 1).flatten()
         # before updating the network, load the current weights into the old network
         self.old_net.load_state_dict(self.net.state_dict())
         # start to update the network
         pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns,
                                            mb_advs)
         # display the training information
         if update % self.args.display_interval == 0:
             print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, PL: {:.3f}, '\
                 'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1)*self.args.nsteps*self.args.num_workers, \
                 final_rewards.mean(), final_rewards.min(), final_rewards.max(), pl, vl, ent))
             # save the model
             if self.args.env_type == 'atari':
                 torch.save(self.net.state_dict(),
                            self.model_path + '/model.pt')
             else:
                 # for the mujoco, we also need to keep the running mean filter!
                 torch.save([self.net.state_dict(), self.running_state],
                            self.model_path + '/model.pt')
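
For the MuJoCo branch, observations are normalized by self.running_state before they reach the network, and the filter state is saved next to the network weights so it can be replayed at test time. A minimal online mean/std filter with the same call-style interface is sketched below; the filter actually used by the repository may additionally clip observations or track its statistics differently.

import numpy as np


class RunningState:
    # online estimate of the observation mean and variance, used to whiten inputs
    def __init__(self, shape, eps=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = eps

    def __call__(self, x, update=True):
        x = np.asarray(x, dtype=np.float64)
        if update:
            self.count += 1.0
            delta = x - self.mean
            self.mean += delta / self.count
            # incremental (population) variance update
            self.var += (delta * (x - self.mean) - self.var) / self.count
        return (x - self.mean) / (np.sqrt(self.var) + 1e-8)
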
Example #6
0
    def learn(self):

        # configuration
        USER_SAVE_DATE = '3006'
        USER_SAVE_MODEL = 'mymodel.pt'
        CONTINUE_TRAINING = False  # False for new training, True for improving the existing model
        num_of_iteration = 0

        # paths
        date = USER_SAVE_DATE
        plot_path = self.model_path + '/' + date + '/plots/plot_'
        best_model_path = self.model_path + '/' + date + '/best/'
        all_model_path = self.model_path + '/' + date
        reward_path = self.model_path + '/' + date + '/rewards/'

        load_model = CONTINUE_TRAINING
        best_model = all_model_path + '/' + USER_SAVE_MODEL
        all_final_rewards = []

        num_updates = 1000000
        obs = self.running_state(self.env.reset())

        final_reward = 0
        episode_reward = 0
        self.dones = False

        # Load the best model for continuing training
        if load_model:
            print("=> Loading checkpoint...")
            checkpoint = torch.load(best_model)
            self.start_episode = checkpoint['update']
            self.net.load_state_dict(checkpoint['state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.running_state = checkpoint['running_state']
            final_reward = checkpoint['reward']
            all_final_rewards.append(final_reward)
            #print("=> loaded checkpoint (Episode: {}, reward: {})".format(checkpoint['update'], final_reward))

        for update in range(self.start_episode, num_updates):
            mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
            for step in range(self.args.nsteps):
                with torch.no_grad():
                    obs_tensor = self._get_tensors(obs)
                    value, pi = self.net(obs_tensor)
                # select actions
                actions = select_actions(pi)
                # store information
                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_dones.append(self.dones)
                mb_values.append(value.detach().numpy().squeeze())
                # start to execute actions in the environment
                obs_, reward, done, _ = self.env.step(actions)
                self.dones = done
                mb_rewards.append(reward)
                if done:
                    obs_ = self.env.reset()
                obs = self.running_state(obs_)
                episode_reward += reward
                mask = 0.0 if done else 1.0
                final_reward *= mask
                final_reward += (1 - mask) * episode_reward
                episode_reward *= mask
            # to process the rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
            mb_actions = np.asarray(mb_actions, dtype=np.float32)
            mb_dones = np.asarray(mb_dones, dtype=bool)
            mb_values = np.asarray(mb_values, dtype=np.float32)
            # compute the last state value
            with torch.no_grad():
                obs_tensor = self._get_tensors(obs)
                last_value, _ = self.net(obs_tensor)
                last_value = last_value.detach().numpy().squeeze()
            # compute the advantages
            mb_returns = np.zeros_like(mb_rewards)
            mb_advs = np.zeros_like(mb_rewards)
            lastgaelam = 0
            for t in reversed(range(self.args.nsteps)):
                if t == self.args.nsteps - 1:
                    nextnonterminal = 1.0 - self.dones
                    nextvalues = last_value
                else:
                    nextnonterminal = 1.0 - mb_dones[t + 1]
                    nextvalues = mb_values[t + 1]
                delta = mb_rewards[
                    t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[
                        t]
                mb_advs[
                    t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
            mb_returns = mb_advs + mb_values
            # normalize the advantages
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5)
            # before the update, copy the current network's parameters into the old network
            self.old_net.load_state_dict(self.net.state_dict())
            # start to update the network
            policy_loss, value_loss = self._update_network(
                mb_obs, mb_actions, mb_returns, mb_advs)
            #torch.save([self.net.state_dict(), self.running_state], self.model_path + 'model.pt')

            print('Episode: {} / {}, Iteration: {}, Reward: {:.3f}'.format(
                update, num_updates, (update + 1) * self.args.nsteps,
                final_reward))

            all_final_rewards.append(final_reward.item())
            self.save_model_for_training(update,
                                         final_reward.item(),
                                         filepath=best_model_path +
                                         str(round(final_reward.item(), 2)) +
                                         '_' + str(update) + '.pt')

            torch.save([self.net.state_dict(), self.running_state],
                       self.model_path + "/" + date + "/" +
                       str(round(final_reward.item(), 2)) + str(update) +
                       '_testing' + ".pt")

            if update % self.args.display_interval == 0:
                fig = plt.figure()
                ax = fig.add_subplot(111)
                plt.plot(np.arange(len(all_final_rewards)), all_final_rewards)
                plt.ylabel('Reward')
                plt.xlabel('Episode #')
                plt.savefig(plot_path + str(update) + '.png')
                plt.close(fig)
                reward_df = pd.DataFrame(all_final_rewards)
                with open(reward_path + 'rewards.csv', 'a') as f:
                    reward_df.to_csv(f, header=False)
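
save_model_for_training is not shown, but the load_model branch at the top of the method documents the checkpoint layout: a dict with the keys 'update', 'state_dict', 'optimizer', 'running_state' and 'reward'. A sketch consistent with that format (only the key names are taken from the code above; the body is an assumption):

import torch


def save_model_for_training(self, update, reward, filepath):
    # method of the agent class; checkpoint keys mirror the load_model branch
    checkpoint = {
        'update': update,
        'state_dict': self.net.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'running_state': self.running_state,
        'reward': reward,
    }
    torch.save(checkpoint, filepath)
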
Example #7
0
 def learn(self):
     log_data = {}
     num_updates = self.args.total_frames // (self.args.nsteps *
                                              self.args.num_workers)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     delay_step = 0
     delay_rewards = 0
     for update in range(num_updates):
         mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_values_mix, mb_values_ex, mb_obs_ = [], [], [], [], [], [], []
         if self.args.lr_decay:
             self._adjust_learning_rate(update, num_updates)
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 # get tensors
                 obs_tensor = self._get_tensors(self.obs)
                 v_mix, pis = self.net(obs_tensor)
                 # select actions
                 actions = select_actions(pis, self.args.dist,
                                          self.args.env_type)
                 actions_tensor = torch.tensor(
                     actions,
                     dtype=torch.float32,
                     device='cuda'
                     if self.args.cuda else 'cpu').unsqueeze(0)
                 _, v_ex = self.intrinsic_net(obs_tensor)
             # try to predict the intrinsic reward
             if self.args.env_type == 'atari':
                 input_actions = actions
             else:
                 if self.args.dist == 'gauss':
                     input_actions = actions.copy()
                 elif self.args.dist == 'beta':
                     input_actions = -1 + 2 * actions
             # start to store information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(actions)
             mb_dones.append(self.dones)
             mb_values_mix.append(v_mix.detach().cpu().numpy().squeeze())
             mb_values_ex.append(v_ex.detach().cpu().numpy().squeeze())
             # start to execute the actions in the environment
             obs_, rewards, dones, _ = self.envs.step(input_actions)
             obs_ = np.expand_dims(self.running_state(obs_), 0)
             mb_obs_.append(np.copy(obs_))
             delay_step += 1
             delay_rewards += rewards
             if dones or delay_step == self.args.reward_delay_freq:
                 rewards = delay_rewards
                 delay_step, delay_rewards = 0, 0
             else:
                 rewards = 0
             # update dones
             if self.args.env_type == 'mujoco':
                 dones = np.array([dones])
                 rewards = np.array([rewards])
             self.dones = dones
             mb_rewards_ex.append(rewards)
             # clear the observation
             self.obs = obs_
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
                     if self.args.env_type == 'mujoco':
                         # reset the environment
                         obs_ = self.envs.reset()
                         self.obs = np.expand_dims(self.running_state(obs_),
                                                   0)
             #self.obs = obs if self.args.env_type == 'atari' else np.expand_dims(self.running_state(obs), 0)
             # process the rewards part -- display the rewards on the screen
             episode_rewards += rewards
             masks = np.array([0.0 if done_ else 1.0 for done_ in dones],
                              dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
         # process the rollouts
         mb_obs_ = np.asarray(mb_obs_, dtype=np.float32)
         mb_obs_ = mb_obs_.swapaxes(0, 1).reshape(self.batch_ob_shape)
         # process the current observations
         mb_obs = np.asarray(mb_obs, dtype=np.float32)
         mb_obs = mb_obs.swapaxes(0, 1).reshape(self.batch_ob_shape)
         mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32)
         mb_actions = np.asarray(mb_actions, dtype=np.float32)
         mb_dones = np.asarray(mb_dones, dtype=bool)
         mb_values_mix = np.asarray(mb_values_mix, dtype=np.float32)
         mb_values_ex = np.asarray(mb_values_ex, dtype=np.float32)
         # calculate the r_in
         mb_rewards_in = self._computer_intrinsic_rewards(mb_obs, mb_obs_)
         if self.args.env_type == 'mujoco':
             mb_values_mix = np.expand_dims(mb_values_mix, 1)
             mb_values_ex = np.expand_dims(mb_values_ex, 1)
         # compute the last state value
         with torch.no_grad():
             obs_tensor = self._get_tensors(self.obs)
             last_values_mix, _ = self.net(obs_tensor)
             last_values_mix = last_values_mix.detach().cpu().numpy(
             ).squeeze()
             # compute the last extrinsic value
             _, last_values_ex = self.intrinsic_net(obs_tensor)
             last_values_ex = last_values_ex.detach().cpu().numpy().squeeze(
             )
         # compute some other useful information which will be used in the training
         mb_values_mix_next = np.zeros_like(mb_values_mix)
         mb_values_mix_next[:-1] = mb_values_mix[1:] * (1.0 - mb_dones[1:])
         mb_values_mix_next[-1] = last_values_mix * (1 - self.dones)
         # compute the mixed-value TD terms (gamma * V_mix(s') - V_mix(s))
         td_mix = self.args.gamma * mb_values_mix_next - mb_values_mix
         # start to compute advantages...
         mb_advs_mix = np.zeros_like(mb_rewards_ex)
         mb_advs_ex = np.zeros_like(mb_rewards_ex)
         # calculate the reward_mix
         mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
         lastgaelam_mix, lastgaelam_ex = 0, 0
         for t in reversed(range(self.args.nsteps)):
             if t == self.args.nsteps - 1:
                 nextnonterminal = 1.0 - self.dones
                 nextvalues_mix = last_values_mix
                 nextvalues_ex = last_values_ex
             else:
                 nextnonterminal = 1.0 - mb_dones[t + 1]
                 nextvalues_mix = mb_values_mix[t + 1]
                 nextvalues_ex = mb_values_ex[t + 1]
             delta_mix = mb_rewards_mix[
                 t] + self.args.gamma * nextvalues_mix * nextnonterminal - mb_values_mix[
                     t]
             delta_ex = mb_rewards_ex[
                 t] + self.args.gamma * nextvalues_ex * nextnonterminal - mb_values_ex[
                     t]
             mb_advs_mix[
                 t] = lastgaelam_mix = delta_mix + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam_mix
             mb_advs_ex[
                 t] = lastgaelam_ex = delta_ex + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam_ex
         #mb_returns = mb_advs + mb_values
         mb_returns_mix = mb_advs_mix + mb_values_mix
         mb_returns_ex = mb_advs_ex + mb_values_ex
         # after computing the returns, process the rollouts
         if self.args.env_type == 'atari':
             mb_actions = mb_actions.swapaxes(0, 1).flatten()
         mb_returns_mix = mb_returns_mix.swapaxes(0, 1).flatten()
         mb_returns_ex = mb_returns_ex.swapaxes(0, 1).flatten()
         mb_advs_mix = mb_advs_mix.swapaxes(0, 1).flatten()
         mb_advs_ex = mb_advs_ex.swapaxes(0, 1).flatten()
         # flatten the rewards
         mb_rewards_ex = mb_rewards_ex.swapaxes(0, 1).flatten()
         mb_rewards_in = mb_rewards_in.swapaxes(0, 1).flatten()
         td_mix = td_mix.swapaxes(0, 1).flatten()
         mb_dones = mb_dones.swapaxes(0, 1).flatten()
         mb_values_mix = mb_values_mix.swapaxes(0, 1).flatten()
         # before updating the network, load the current weights into the old network
         self.old_net.load_state_dict(self.net.state_dict())
         # start to update the network
         pl, vl, ent = self._update_network(mb_obs, mb_actions, mb_returns_mix, mb_returns_ex, mb_advs_mix, mb_advs_ex, \
                 mb_rewards_in, mb_rewards_ex, td_mix, mb_dones, mb_values_mix, mb_obs_)
         # display the training information
         if update % self.args.display_interval == 0:
             print('[{}] Update: {} / {}, Frames: {}, Rewards: {:.3f}, Min: {:.3f}, Max: {:.3f}, R_in: {:.3f}, R_ex: {:.3f}, PL: {:.3f}, '\
                 'VL: {:.3f}, Ent: {:.3f}'.format(datetime.now(), update, num_updates, (update + 1)*self.args.nsteps*self.args.num_workers, \
                 final_rewards.mean(), final_rewards.min(), final_rewards.max(), np.mean(mb_rewards_in), np.mean(mb_rewards_ex), pl, vl, ent))
             # save the model
             if self.args.env_type == 'atari':
                 torch.save(self.net.state_dict(),
                            self.model_path + '/model.pt')
             else:
                 # for the mujoco, we also need to keep the running mean filter!
                 torch.save([
                     self.net.state_dict(), self.running_state,
                     self.intrinsic_net.state_dict()
                 ], self.model_path + '/model.pt')
         # save log data
         log_data[update] = {'frames': (update + 1)*self.args.nsteps*self.args.num_workers, 'rewards_mean': final_rewards.mean(), \
                 'rewards_in': np.mean(mb_rewards_in), 'rewards_ex': np.mean(mb_rewards_ex)}
         torch.save(
             log_data, '{}/{}.pt'.format(self.intrinsic_data_path,
                                         self.args.env_name))
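
A distinctive detail of this variant is the reward delay: extrinsic rewards are accumulated in delay_rewards and only handed to the agent every reward_delay_freq steps or when an episode ends, which makes the reward signal sparser. The small standalone reproduction below shows the effect on a toy reward stream (the numbers are made up for illustration):

def delay_reward_stream(rewards, dones, delay_freq):
    # return the reward sequence the agent actually observes under delayed rewards
    delayed, acc, steps = [], 0.0, 0
    for r, done in zip(rewards, dones):
        acc += r
        steps += 1
        if done or steps == delay_freq:
            delayed.append(acc)
            acc, steps = 0.0, 0
        else:
            delayed.append(0.0)
    return delayed


# with delay_freq=3 and no episode ends, the per-step rewards [1, 1, 1, 1, 1]
# become [0.0, 0.0, 3.0, 0.0, 0.0]: the agent is only paid every third step
print(delay_reward_stream([1, 1, 1, 1, 1], [False] * 5, 3))
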
Example #8
0
import cv2
import numpy as np
import torch
# get_args, make_atari_env, VecFrameStack, Net and select_actions are assumed to be
# imported from the project's own modules, as in the original script


# convert the stacked (N, H, W, C) observation into a (N, C, H, W) float tensor
def get_tensors(obs):
    input_tensor = torch.tensor(np.transpose(obs, (0, 3, 1, 2)),
                                dtype=torch.float32)
    return input_tensor


if __name__ == "__main__":
    args = get_args()
    # create environment
    env = VecFrameStack(make_atari_env(args.env_name, 1, args.seed), 4)
    # get the model path
    model_path = args.save_dir + args.env_name + '/model.pt'
    network = Net(env.action_space.n)
    network.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    obs = env.reset()
    while True:
        env.render()
        # query the policy for the current obs
        with torch.no_grad():
            input_tensor = get_tensors(obs)
            _, pi = network(input_tensor)
        actions = select_actions(pi, True)
        obs, reward, done, _ = env.step([actions])
    env.close()
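
This demo script touches only three command-line arguments: env_name, seed and save_dir. A minimal get_args that would satisfy it is sketched below; the real argument parser in the repository defines many more training flags, and the defaults here are placeholders rather than values taken from the project.

import argparse


def get_args():
    parser = argparse.ArgumentParser(description='play a trained Atari agent')
    parser.add_argument('--env-name', type=str, default='PongNoFrameskip-v4',
                        help='the gym environment id')
    parser.add_argument('--seed', type=int, default=123,
                        help='random seed for the environment')
    parser.add_argument('--save-dir', type=str, default='saved_models/',
                        help='directory that contains <env_name>/model.pt')
    return parser.parse_args()
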
Example #9
0
 def learn(self):
     num_updates = self.args.total_timesteps // self.args.nsteps
     obs = self.running_state(self.env.reset())
     final_reward = 0
     episode_reward = 0
     self.dones = False
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones, mb_values = [], [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 obs_tensor = self._get_tensors(obs)
                 value, pi = self.net(obs_tensor)
             # select actions
             actions = select_actions(pi)
             # store information
             mb_obs.append(np.copy(obs))
             mb_actions.append(actions)
             mb_dones.append(self.dones)
             mb_values.append(value.detach().numpy().squeeze())
             # start to execute actions in the environment
             obs_, reward, done, _ = self.env.step(actions)
             self.dones = done
             mb_rewards.append(reward)
             if done:
                 obs_ = self.env.reset()
             obs = self.running_state(obs_)
             episode_reward += reward
             mask = 0.0 if done else 1.0
             final_reward *= mask
             final_reward += (1 - mask) * episode_reward
             episode_reward *= mask
         # to process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.float32)
         mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
         mb_actions = np.asarray(mb_actions, dtype=np.float32)
         mb_dones = np.asarray(mb_dones, dtype=bool)
         mb_values = np.asarray(mb_values, dtype=np.float32)
         # compute the last state value
         with torch.no_grad():
             obs_tensor = self._get_tensors(obs)
             last_value, _ = self.net(obs_tensor)
             last_value = last_value.detach().numpy().squeeze()
         # compute the advantages
         mb_returns = np.zeros_like(mb_rewards)
         mb_advs = np.zeros_like(mb_rewards)
         lastgaelam = 0
         for t in reversed(range(self.args.nsteps)):
             if t == self.args.nsteps - 1:
                 nextnonterminal = 1.0 - self.dones
                 nextvalues = last_value
             else:
                 nextnonterminal = 1.0 - mb_dones[t + 1]
                 nextvalues = mb_values[t + 1]
             delta = mb_rewards[
                 t] + self.args.gamma * nextvalues * nextnonterminal - mb_values[
                     t]
             mb_advs[
                 t] = lastgaelam = delta + self.args.gamma * self.args.tau * nextnonterminal * lastgaelam
         mb_returns = mb_advs + mb_values
         # normalize the advantages
         mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-5)
         # before the update, copy the current network's parameters into the old network
         self.old_net.load_state_dict(self.net.state_dict())
         # start to update the network
         policy_loss, value_loss = self._update_network(
             mb_obs, mb_actions, mb_returns, mb_advs)
         torch.save([self.net.state_dict(), self.running_state],
                    self.model_path + 'model.pt')
         print('[{}] Update: {} / {}, Frames: {}, Reward: {:.3f}, VL: {:.3f}, PL: {:.3f}'.format(datetime.now(), update, \
                 num_updates, (update + 1)*self.args.nsteps, final_reward, value_loss, policy_loss))
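
Copying the current weights into self.old_net right before _update_network is the tell-tale sign of PPO's clipped surrogate objective: the old network supplies the fixed behaviour log-probabilities against which the probability ratio is clipped. A sketch of the core loss terms, given log-probabilities, advantages, values and returns as tensors (clip_range and the squared-error value loss are assumptions; the repository's _update_network may differ in detail):

import torch


def ppo_losses(new_log_probs, old_log_probs, advantages, values, returns, clip_range=0.2):
    # probability ratio between the current and the old policy
    ratio = torch.exp(new_log_probs - old_log_probs)
    # clipped surrogate objective (maximized, so the loss is its negation)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    # plain mean-squared error for the value head
    value_loss = (returns - values).pow(2).mean()
    return policy_loss, value_loss
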
Example #10
0
 def learn(self):
     num_updates = self.args.total_frames // (self.args.num_workers * self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     # start to update
     for update in range(num_updates):
         if self.args.lr_decay:
             self._adjust_learning_rate(update, num_updates)
         mb_obs, mb_rewards_ex, mb_actions, mb_dones, mb_obs_, mb_v_ex, mb_v_mix = [], [], [], [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 v_mix, pi = self.net(input_tensor)
                 _, v_ex = self.intrinsic_net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             mb_v_ex.append(v_ex.detach().cpu().numpy().squeeze())
             mb_v_mix.append(v_mix.detach().cpu().numpy().squeeze())
             # step
             obs_, rewards, dones, _ = self.envs.step(cpu_actions)
             # store the next observation
             mb_obs_.append(np.copy(obs_))
             # start to store the rewards
             self.dones = dones
             mb_rewards_ex.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n]*0
             self.obs = obs_
             episode_rewards += rewards
             # get the masks
             masks = np.array([0.0 if done else 1.0 for done in dones], dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
         mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
         """
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
         mb_obs_ = np.asarray(mb_obs_, dtype=np.uint8).swapaxes(1, 0)
         """
         # calculate the intrinsic rewards and make sure the dimensions are right
         mb_rewards_in = self._compute_intrinsic_rewards(mb_obs, mb_obs_)
         mb_rewards_in = mb_rewards_in.reshape((self.args.num_workers, self.args.nsteps))
         # --- next
         mb_rewards_ex = np.asarray(mb_rewards_ex, dtype=np.float32).swapaxes(1, 0)
         # calculate the mix reward
         mb_rewards_mix = self.args.r_ext_coef * mb_rewards_ex + self.args.r_in_coef * mb_rewards_in
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_v_ex = np.asarray(mb_v_ex, dtype=np.float32).swapaxes(1, 0)
         mb_v_mix = np.asarray(mb_v_mix, dtype=np.float32).swapaxes(1, 0)
         # masks
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         # calculate the last value
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values_mix, _ = self.net(input_tensor)
             last_values_mix = last_values_mix.detach().cpu().numpy().squeeze()
             # then last value ex
             _, last_values_ex = self.intrinsic_net(input_tensor)
             last_values_ex = last_values_ex.detach().cpu().numpy().squeeze()
         # get the returns ex and in
         mb_returns_ex, mb_returns_mix = np.zeros(mb_rewards_ex.shape), np.zeros(mb_rewards_in.shape)
         # compute returns
         for n, (rewards_ex, rewards_mix, dones, value_mix, value_ex) in enumerate(zip(mb_rewards_ex, mb_rewards_mix, mb_dones, last_values_mix, last_values_ex)):
             rewards_ex = rewards_ex.tolist()
             rewards_mix = rewards_mix.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 returns_ex = discount_with_dones(rewards_ex+[value_ex], dones+[0], self.args.gamma)[:-1]
                 returns_mix = discount_with_dones(rewards_mix+[value_mix], dones+[0], self.args.gamma)[:-1]
             else:
                 returns_ex = discount_with_dones(rewards_ex, dones, self.args.gamma)
                 returns_mix = discount_with_dones(rewards_mix, dones, self.args.gamma)
             mb_returns_ex[n] = returns_ex
             mb_returns_mix[n] = returns_mix
         # flatten everything for the update
         mb_rewards_ex = mb_rewards_ex.flatten()
         mb_rewards_in = mb_rewards_in.flatten()
         mb_returns_ex = mb_returns_ex.flatten()
         mb_returns_mix = mb_returns_mix.flatten()
         mb_actions = mb_actions.flatten()
         mb_v_ex = mb_v_ex.flatten()
         mb_v_mix = mb_v_mix.flatten()
         mb_dones = mb_dones.flatten()
         mb_masks = mb_masks.flatten()
         # before training, build the coefficient matrix for the discounted mixed returns
         dis_v_mix_last = np.zeros([mb_obs.shape[0]], np.float32)
         coef_mat = np.zeros([mb_obs.shape[0], mb_obs.shape[0]], np.float32)
         for i in range(mb_obs.shape[0]):
             dis_v_mix_last[i] = self.args.gamma ** (self.args.nsteps - i % self.args.nsteps) * last_values_mix[i // self.args.nsteps]
             coef = 1.0
             for j in range(i, mb_obs.shape[0]):
                 if j > i and j % self.args.nsteps == 0:
                     break
                 coef_mat[i][j] = coef
                 coef *= self.args.gamma
                 if mb_dones[j]:
                     dis_v_mix_last[i] = 0
                     break
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_obs_, mb_masks, mb_actions, mb_rewards_ex, mb_returns_ex, mb_v_ex, mb_v_mix, \
                                             dis_v_mix_last, coef_mat)
         if update % self.args.log_interval == 0:
             print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max: {}, R_in: {:.3f}'.format(\
                 datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\
                 final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max(), np.mean(mb_rewards_in)))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')
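
The least obvious part of this variant is the coefficient matrix built just before the update: row i of coef_mat holds the discount factors gamma**(j - i) for every step j that belongs to the same n-step segment as step i (cut off at episode ends), and dis_v_mix_last carries the matching discounted bootstrap value. The toy reproduction below mirrors that loop for a single worker so the structure is easy to inspect; nsteps, gamma and the done pattern are made-up values.

import numpy as np


def build_coef_mat(nsteps, gamma, dones, last_value):
    # single-worker version of the coefficient-matrix loop above
    dis_v_last = np.zeros(nsteps, dtype=np.float32)
    coef_mat = np.zeros((nsteps, nsteps), dtype=np.float32)
    for i in range(nsteps):
        dis_v_last[i] = gamma ** (nsteps - i % nsteps) * last_value
        coef = 1.0
        for j in range(i, nsteps):
            if j > i and j % nsteps == 0:
                break
            coef_mat[i][j] = coef
            coef *= gamma
            if dones[j]:
                dis_v_last[i] = 0.0
                break
    return coef_mat, dis_v_last


# nsteps=4, gamma=0.5, and an episode that ends at step 2
coef_mat, dis_v_last = build_coef_mat(4, 0.5, [0, 0, 1, 0], last_value=1.0)
print(coef_mat)    # upper-triangular rows of powers of gamma, truncated at the done
print(dis_v_last)  # zero wherever the segment hits a done before bootstrapping
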
Example #11
0
 def learn(self):
     num_updates = self.args.total_frames // (self.args.num_workers *
                                              self.args.nsteps)
     # get the reward to calculate other information
     episode_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     final_rewards = np.zeros((self.args.num_workers, ), dtype=np.float32)
     # start to update
     for update in range(num_updates):
         mb_obs, mb_rewards, mb_actions, mb_dones = [], [], [], []
         for step in range(self.args.nsteps):
             with torch.no_grad():
                 input_tensor = self._get_tensors(self.obs)
                 _, pi = self.net(input_tensor)
             # select actions
             actions = select_actions(pi)
             cpu_actions = actions.squeeze(1).cpu().numpy()
             # start to store the information
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(cpu_actions)
             mb_dones.append(self.dones)
             # step
             obs, rewards, dones, _ = self.envs.step(cpu_actions)
             # start to store the rewards
             self.dones = dones
             mb_rewards.append(rewards)
             for n, done in enumerate(dones):
                 if done:
                     self.obs[n] = self.obs[n] * 0
             self.obs = obs
             episode_rewards += rewards
             # get the masks
             masks = np.array([0.0 if done else 1.0 for done in dones],
                              dtype=np.float32)
             final_rewards *= masks
             final_rewards += (1 - masks) * episode_rewards
             episode_rewards *= masks
             # update the obs
         mb_dones.append(self.dones)
         # process the rollouts
         mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(
             self.batch_ob_shape)
         mb_rewards = np.asarray(mb_rewards,
                                 dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
         mb_masks = mb_dones[:, :-1]
         mb_dones = mb_dones[:, 1:]
         # calculate the last value
         with torch.no_grad():
             input_tensor = self._get_tensors(self.obs)
             last_values, _ = self.net(input_tensor)
         # compute returns
         for n, (rewards, dones, value) in enumerate(
                 zip(mb_rewards, mb_dones,
                     last_values.detach().cpu().numpy().squeeze())):
             rewards = rewards.tolist()
             dones = dones.tolist()
             if dones[-1] == 0:
                 rewards = discount_with_dones(rewards + [value],
                                               dones + [0],
                                               self.args.gamma)[:-1]
             else:
                 rewards = discount_with_dones(rewards, dones,
                                               self.args.gamma)
             mb_rewards[n] = rewards
         mb_rewards = mb_rewards.flatten()
         mb_actions = mb_actions.flatten()
         # start to update network
         vl, al, ent = self._update_network(mb_obs, mb_rewards, mb_actions)
         if update % self.args.log_interval == 0:
             print('[{}] Update: {}/{}, Frames: {}, Rewards: {:.1f}, VL: {:.3f}, PL: {:.3f}, Ent: {:.2f}, Min: {}, Max: {}'.format(\
                 datetime.now(), update, num_updates, (update+1)*(self.args.num_workers * self.args.nsteps),\
                 final_rewards.mean(), vl, al, ent, final_rewards.min(), final_rewards.max()))
             torch.save(self.net.state_dict(), self.model_path + 'model.pt')
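
For completeness: the _update_network call of this plain A2C variant typically combines a policy-gradient term weighted by the advantage (return minus value), a value-regression term and an entropy bonus, which matches the three quantities (vl, al, ent) logged above. A sketch of such a loss, assuming the network produces state values and action logits (the coefficients are common defaults, not values read from this code):

import torch
import torch.nn.functional as F


def a2c_loss(values, logits, actions, returns, value_coef=0.5, ent_coef=0.01):
    # advantages are treated as constants in the policy-gradient term
    advantages = returns - values.squeeze(-1)
    log_probs = F.log_softmax(logits, dim=-1)
    chosen_log_probs = log_probs.gather(1, actions.long().unsqueeze(-1)).squeeze(-1)
    policy_loss = -(advantages.detach() * chosen_log_probs).mean()
    value_loss = advantages.pow(2).mean()
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * log_probs).sum(dim=-1).mean()
    return policy_loss + value_coef * value_loss - ent_coef * entropy
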