Exemplo n.º 1
0
    def test_model(self, episode=None): #no explore here
        if episode is None:
            episode = self.tested_episodes


        # Set models to eval mode
        self.transition_model.eval()
        self.observation_model.eval()
        self.reward_model.eval()
        self.encoder.eval()
        
        # Initialise parallelised test environments
        test_envs = EnvBatcher(ControlSuiteEnv, (self.parms.env_name, self.parms.seed, self.parms.max_episode_length, self.parms.bit_depth), {}, self.parms.test_episodes)
        total_steps = self.parms.max_episode_length // test_envs.action_repeat
        rewards = np.zeros(self.parms.test_episodes)
        
        real_rew = torch.zeros([total_steps,self.parms.test_episodes])
        predicted_rew = torch.zeros([total_steps,self.parms.test_episodes])

        with torch.no_grad():
            observation, total_rewards, video_frames = test_envs.reset(), np.zeros((self.parms.test_episodes, )), []            
            belief, posterior_state, action = torch.zeros(self.parms.test_episodes, self.parms.belief_size, device=self.parms.device), torch.zeros(self.parms.test_episodes, self.parms.state_size, device=self.parms.device), torch.zeros(self.parms.test_episodes, self.env.action_size, device=self.parms.device)
            tqdm.write("Testing model.")
            for t in range(total_steps):     
                belief, posterior_state, action, next_observation, rewards, done, pred_next_rew  = self.update_belief_and_act(test_envs,  belief, posterior_state, action, observation.to(device=self.parms.device), list(rewards), self.env.action_range[0], self.env.action_range[1])
                total_rewards += rewards.numpy()
                real_rew[t] = rewards
                predicted_rew[t]  = pred_next_rew

                observation = self.env.get_original_frame().unsqueeze(dim=0)

                video_frames.append(make_grid(torch.cat([observation, self.observation_model(belief, posterior_state).cpu()], dim=3) + 0.5, nrow=5).numpy())  # Decentre
                observation = next_observation
                if done.sum().item() == self.parms.test_episodes:
                    break
            
        real_rew = torch.transpose(real_rew, 0, 1)
        predicted_rew = torch.transpose(predicted_rew, 0, 1)
        
        #save and plot metrics 
        self.tested_episodes += 1
        self.metrics['test_episodes'].append(episode)
        self.metrics['test_rewards'].append(total_rewards.tolist())

        lineplot(self.metrics['test_episodes'], self.metrics['test_rewards'], 'test_rewards', self.statistics_path)
        
        write_video(video_frames, 'test_episode_%s' % str(episode), self.video_path)  # Lossy compression
        # Set models to train mode
        self.transition_model.train()
        self.observation_model.train()
        self.reward_model.train()
        self.encoder.train()
        # Close test environments
        test_envs.close()
        return self.metrics
Exemplo n.º 2
0
        metrics['episodes'].append(episode)
        metrics['train_rewards'].append(total_reward)
        lineplot(metrics['episodes'][-len(metrics['train_rewards']):],
                 metrics['train_rewards'], 'train_rewards', results_dir)

    print("Testing!")
    # Test model
    if episode % args.test_interval == 0:
        # Set models to eval mode
        transition_model.eval()
        observation_model.eval()
        reward_model.eval()
        encoder.eval()
        # Initialise parallelised test environments
        test_envs = EnvBatcher(
            Env, (args.env, args.symbolic_env, args.seed,
                  args.max_episode_length, args.action_repeat, args.bit_depth),
            {}, args.test_episodes)

        with torch.no_grad():
            observation, total_rewards, video_frames = test_envs.reset(
            ), np.zeros((args.test_episodes, )), []
            belief, posterior_state, action = torch.zeros(
                args.test_episodes, args.belief_size,
                device=args.device), torch.zeros(
                    args.test_episodes, args.state_size,
                    device=args.device), torch.zeros(args.test_episodes,
                                                     env.action_size,
                                                     device=args.device)
            pbar = tqdm(range(args.max_episode_length // args.action_repeat))
            for t in pbar:
                belief, posterior_state, action, next_observation, reward, done = update_belief_and_act(
Exemplo n.º 3
0
setup_my_seed(args)
device = get_my_device(args)

# Recorder
results_dir = os.path.join('results', '{}_{}'.format(args.env, args.id))
os.makedirs(results_dir, exist_ok=True)
writer = SummaryWriter(results_dir + "/{}_{}_log".format(args.env, args.id))
metrics = {
    'steps': [], 'episodes': [], 'train_rewards': [], 'test_episodes': [], 'test_rewards': [], 'observation_loss': [], 'reward_loss': [], 'kl_loss': [], 'actor_loss': [], 'value_loss': []
}

# Init training env 
env = Env(args.env, args.seed, args.max_episode_length,
          args.action_repeat, args.bit_depth)
# Initialise parallelised test environments
test_envs = EnvBatcher(Env, (args.env, args.seed, args.max_episode_length,
                                     args.action_repeat, args.bit_depth), {}, args.test_episodes)
# Init replay buffer
if args.experience_replay != '' and os.path.exists(args.experience_replay):
    D = torch.load(args.experience_replay)
    metrics['steps'] = [D.steps] * D.episodes
    metrics['episodes'] = list(range(1, D.episodes + 1))
elif not args.test:
    D = ExperienceReplay(args.experience_size, env.observation_size,
                         env.action_size, args.bit_depth, device)
    # Init dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
Exemplo n.º 4
0
    def test(self, episode):
        print("Test model")
        # Set models to eval mode
        self.transition_model.eval()
        self.observation_model.eval()
        self.reward_model.eval()
        self.encoder.eval()
        self.algorithms.train_to_eval()
        # self.actor_model_g.eval()
        # self.value_model_g.eval()
        # Initialise parallelised test environments
        test_envs = EnvBatcher(
            Env, (args.env, args.symbolic_env, args.seed,
                  args.max_episode_length, args.action_repeat, args.bit_depth),
            {}, args.test_episodes)

        with torch.no_grad():
            observation, total_rewards, video_frames = test_envs.reset(
            ), np.zeros((args.test_episodes, )), []
            belief, posterior_state, action = torch.zeros(
                args.test_episodes, args.belief_size,
                device=args.device), torch.zeros(
                    args.test_episodes, args.state_size,
                    device=args.device), torch.zeros(args.test_episodes,
                                                     self.env.action_size,
                                                     device=args.device)
            pbar = tqdm(range(args.max_episode_length // args.action_repeat))
            for t in pbar:
                belief, posterior_state, action, next_observation, reward, done = self.update_belief_and_act(
                    args, test_envs, belief, posterior_state, action,
                    observation.to(device=args.device))
                total_rewards += reward.numpy()
                if not args.symbolic_env:  # Collect real vs. predicted frames for video
                    video_frames.append(
                        make_grid(torch.cat([
                            observation,
                            self.observation_model(belief,
                                                   posterior_state).cpu()
                        ],
                                            dim=3) + 0.5,
                                  nrow=5).numpy())  # Decentre
                observation = next_observation
                if done.sum().item() == args.test_episodes:
                    pbar.close()
                    break

        # Update and plot reward metrics (and write video if applicable) and save metrics
        self.metrics['test_episodes'].append(episode)
        self.metrics['test_rewards'].append(total_rewards.tolist())

        Save_Txt(self.metrics['test_episodes'][-1],
                 self.metrics['test_rewards'][-1], 'test_rewards',
                 args.results_dir)
        # Save_Txt(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'],'test_rewards_steps', results_dir, xaxis='step')

        # lineplot(metrics['test_episodes'], metrics['test_rewards'], 'test_rewards', results_dir)
        # lineplot(np.asarray(metrics['steps'])[np.asarray(metrics['test_episodes']) - 1], metrics['test_rewards'], 'test_rewards_steps', results_dir, xaxis='step')
        if not args.symbolic_env:
            episode_str = str(episode).zfill(len(str(args.episodes)))
            write_video(video_frames, 'test_episode_%s' % episode_str,
                        args.results_dir)  # Lossy compression
            save_image(
                torch.as_tensor(video_frames[-1]),
                os.path.join(args.results_dir,
                             'test_episode_%s.png' % episode_str))

        torch.save(self.metrics, os.path.join(args.results_dir, 'metrics.pth'))

        # Set models to train mode
        self.transition_model.train()
        self.observation_model.train()
        self.reward_model.train()
        self.encoder.train()
        # self.actor_model_g.train()
        # self.value_model_g.train()
        self.algorithms.eval_to_train()
        # Close test environments
        test_envs.close()
Exemplo n.º 5
0
        lineplot(metrics['episodes'][-len(metrics['train_rewards']):],
                 metrics['train_rewards'], 'train_rewards', results_dir)

    # Test model
    if episode % args.test_interval == 0:
        # Set models to eval mode
        agent.transition_model.eval()
        agent.observation_model.eval()
        agent.reward_model.eval()
        agent.encoder.eval()
        agent.actor_model.eval()
        agent.value_model.eval()

        # Initialise parallelised test environments
        test_envs = EnvBatcher(
            Env, (args.env, args.symbolic, args.seed, args.max_episode_length,
                  args.action_repeat, args.bit_depth), {}, args.test_episodes)

        with torch.no_grad():
            observation = test_envs.reset()
            total_rewards = np.zeros((args.test_episodes, ))
            video_frames = []

            belief = torch.zeros(args.test_episodes,
                                 args.belief_size,
                                 device=args.device)
            posterior_state = torch.zeros(args.test_episodes,
                                          args.state_size,
                                          device=args.device)
            action = torch.zeros(args.test_episodes,
                                 env.action_size,