Example #1
    def evaluate_policy(self, total_steps):
        """
        Evaluate policy

        Args:
            total_steps (int): Current total steps of training
        """
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                if self._normalize_obs:
                    obs = self._obs_normalizer(obs, update=False)
                act, _ = self._policy.get_action(obs, test=True)
                # Clip continuous actions to the action-space bounds;
                # discrete actions are passed through unchanged.
                act = (act if is_discrete(self._env.action_space) else np.clip(
                    act, self._env.action_space.low,
                    self._env.action_space.high))
                next_obs, reward, done, _ = self._test_env.step(act)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=act,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(replay_buffer.sample(self._episode_max_steps),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes
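The two averages returned above are typically written to TensorBoard by the caller. Below is a minimal sketch of that pattern, assuming a trainer object exposing evaluate_policy() as in the example; the log directory and the scalar tag names are illustrative, not tf2rl's actual tags.

import tensorflow as tf

# Illustrative log directory; use whatever directory the trainer writes to.
writer = tf.summary.create_file_writer("results/eval")

def log_evaluation(trainer, total_steps):
    # Run the evaluation episodes and record the averaged metrics.
    avg_return, avg_episode_steps = trainer.evaluate_policy(total_steps)
    with writer.as_default():
        tf.summary.scalar("eval/average_test_return", avg_return, step=total_steps)
        tf.summary.scalar("eval/average_test_episode_length", avg_episode_steps, step=total_steps)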
Example #2
File: trainer.py  Project: keiohta/tf2rl
    def evaluate_policy(self, total_steps):
        """Evaluate the current policy, sharing the observation-normalizer
        statistics of the training environment with the test environment."""
        tf.summary.experimental.set_step(total_steps)
        if self._normalize_obs:
            # Copy the running mean/std statistics from the training env so
            # that test observations are normalized with the same parameters.
            self._test_env.normalizer.set_params(
                *self._env.normalizer.get_params())
        avg_test_return = 0.
        avg_test_steps = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(self._policy,
                                              self._test_env,
                                              size=self._episode_max_steps)
        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            obs = self._test_env.reset()
            avg_test_steps += 1
            for _ in range(self._episode_max_steps):
                action = self._policy.get_action(obs, test=True)
                next_obs, reward, done, _ = self._test_env.step(action)
                avg_test_steps += 1
                if self._save_test_path:
                    replay_buffer.add(obs=obs,
                                      act=action,
                                      next_obs=next_obs,
                                      rew=reward,
                                      done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='rgb_array'))
                elif self._show_test_progress:
                    self._test_env.render()
                episode_return += reward
                obs = next_obs
                if done:
                    break
            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)
            if self._save_test_path:
                save_path(
                    replay_buffer._encode_sample(
                        np.arange(self._episode_max_steps)),
                    os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()
            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)
            avg_test_return += episode_return
        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=3),
                tf.uint8)
            tf.summary.image(
                'train/input_img',
                images,
            )
        return avg_test_return / self._test_episodes, avg_test_steps / self._test_episodes
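This variant copies the observation-normalizer statistics from the training environment to the test environment via normalizer.get_params()/set_params() before running the evaluation episodes. A minimal sketch of such a running-statistics normalizer is shown below; the class name and method signatures are assumptions for illustration, not tf2rl's implementation.

import numpy as np

class RunningObsNormalizer:
    """Tracks a running mean/variance of observations and normalizes with them."""

    def __init__(self, shape, eps=1e-8):
        self.mean = np.zeros(shape, dtype=np.float64)
        self._m2 = np.zeros(shape, dtype=np.float64)
        self.count = eps

    def update(self, obs):
        # Welford's online algorithm for the running mean and variance.
        self.count += 1
        delta = obs - self.mean
        self.mean += delta / self.count
        self._m2 += delta * (obs - self.mean)

    @property
    def var(self):
        return self._m2 / self.count

    def __call__(self, obs, update=True):
        if update:
            self.update(obs)
        return (obs - self.mean) / np.sqrt(self.var + 1e-8)

    def get_params(self):
        return self.mean, self._m2, self.count

    def set_params(self, mean, m2, count):
        # Copy statistics from the training-env normalizer so that test-time
        # observations are scaled identically.
        self.mean = np.array(mean, copy=True)
        self._m2 = np.array(m2, copy=True)
        self.count = float(count)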
Example #3
    def evaluate_policy(self, total_steps):
        """evaluating the policy."""

        tf.summary.experimental.set_step(total_steps)
        
        total_test_return = 0.
        success_traj = 0
        if self._save_test_path:
            replay_buffer = get_replay_buffer(
                self._policy, self._test_env, size=self._episode_max_steps)

        # Per-episode counters, split by whether a straight line from start
        # to goal is collision-free in the sampled workspace.
        straight_line_episode = 0
        no_straight_line_episode = 0
        success_traj_straight_line = 0
        success_traj_no_straight_line = 0

        for i in range(self._test_episodes):
            episode_return = 0.
            frames = []
            workspace, goal, obs = self._test_env.reset()
            start = obs
            reduced_workspace = self._CAE.evaluate(workspace)
            # Concatenate the position observation with the goal and the
            # reduced (CAE-encoded) workspace.
            obs_full = np.concatenate((obs, goal, reduced_workspace))

            for _ in range(self._episode_max_steps):
                action = self._policy.get_action(obs_full)
                next_obs, reward, done, _ = self._test_env.step(action)
                # Concatenate the next position observation with the goal and
                # the reduced workspace.
                next_obs_full = np.concatenate((next_obs, goal, reduced_workspace))

                # Add the transition to the trajectory storage
                self.trajectory.append({
                    'workspace': workspace, 'position': obs, 'next_position': next_obs,
                    'goal': goal, 'action': action, 'reward': reward, 'done': done})
                
                if self._save_test_path:
                    replay_buffer.add(obs=obs_full, act=action,
                                next_obs=next_obs_full, rew=reward, done=done)

                if self._save_test_movie:
                    frames.append(self._test_env.render(mode='plot'))

                elif self._show_test_progress:
                    self._test_env.render()

                episode_return += reward
                obs = next_obs
                obs_full = next_obs_full
                
                if done:
                    break

            prefix = "step_{0:08d}_epi_{1:02d}_return_{2:010.4f}".format(
                total_steps, i, episode_return)

            if self._save_test_path:
                save_path(replay_buffer._encode_sample(np.arange(self._episode_max_steps)),
                          os.path.join(self._output_dir, prefix + ".pkl"))
                replay_buffer.clear()

            if self._save_test_movie:
                frames_to_gif(frames, prefix, self._output_dir)

            if self._save_test_path_sep:
                self._save_traj_separately(prefix)
                   
            total_test_return += episode_return

            # Count successes separately for workspaces where a straight
            # start-to-goal line is collision-free and where it is not.
            if straight_line_feasible(workspace, start, goal, self._test_env):
                straight_line_episode += 1
                if reward == self._test_env.goal_reward:
                    success_traj_straight_line += 1
            else:
                no_straight_line_episode += 1
                if reward == self._test_env.goal_reward:
                    success_traj_no_straight_line += 1

            if reward == self._test_env.goal_reward:
                success_traj += 1

            # Reset the per-episode trajectory storage.
            self.trajectory = []

        if self._show_test_images:
            images = tf.cast(
                tf.expand_dims(np.array(obs).transpose(2, 0, 1), axis=2),
                tf.uint8)
            tf.summary.image('train/input_img', images,)

        avg_test_return = total_test_return / self._test_episodes
        success_rate = success_traj / self._test_episodes
        if straight_line_episode > 0:
            success_rate_straight_line = success_traj_straight_line / straight_line_episode
        else:
            success_rate_straight_line = 0
        if no_straight_line_episode > 0:
            success_rate_no_straight_line = success_traj_no_straight_line / no_straight_line_episode
        else:
            success_rate_no_straight_line = 0
        ratio_straight_lines = straight_line_episode / self._test_episodes

        return avg_test_return, success_rate, ratio_straight_lines, success_rate_straight_line, success_rate_no_straight_line
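straight_line_feasible() is used above to split the success rate by whether the straight segment from start to goal is already collision-free. The following is a minimal sketch of such a check, assuming the workspace is a 2-D occupancy grid and that the environment exposes the grid cell size; the helper name, the grid layout, and the env.grid_size attribute are assumptions for illustration, not the project's actual implementation.

import numpy as np

def straight_line_feasible(workspace, start, goal, env, num_samples=100):
    """Return True if the straight segment from start to goal hits no obstacle."""
    start = np.asarray(start, dtype=np.float64)
    goal = np.asarray(goal, dtype=np.float64)
    for t in np.linspace(0.0, 1.0, num_samples):
        point = (1.0 - t) * start + t * goal
        # Map the continuous position onto an occupancy-grid cell (cell size assumed).
        row, col = np.floor(point / env.grid_size).astype(int)
        if workspace[row, col] != 0:  # a non-zero cell is treated as an obstacle
            return False
    return True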