Example #1
    def _evaluate(self, policies, evaluation_env):
        """Perform evaluation for the current policy."""

        if self._eval_n_episodes < 1:
            return

        # TODO: max_path_length should be a property of environment.
        paths = rollouts(evaluation_env, policies['seek'],
                         self.sampler._max_path_length, self._eval_n_episodes)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        evaluation_env.log_diagnostics(paths)
        if self._eval_render:
            evaluation_env.render(paths)

        if self.sampler.batch_ready():
            batch = self.sampler.random_batch()
            self.log_diagnostics(batch)

        return paths
Example #2
    def _evaluate(self, policy, evaluation_env):
        """Perform evaluation for the current policy."""

        if self._eval_n_episodes < 1:
            return

        # TODO: max_path_length should be a property of environment.
        paths = rollouts(evaluation_env, policy, self.sampler._max_path_length,
                         self._eval_n_episodes)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))

        evaluation_env.log_diagnostics(paths)
        if self._eval_render:
            evaluation_env.render(paths)

        if self.sampler.batch_ready():
            batch = self.sampler.random_batch()
            self.log_diagnostics(batch)
Example #3
    def _evaluate(self, epoch):
        """Perform evaluation for the current policy.

        :param epoch: The epoch number.
        :return: None
        """

        if self._eval_n_episodes < 1:
            return

        paths = rollouts(self._eval_env, self.policy, self._max_path_length,
                         self._eval_n_episodes)

        total_returns = [path['rewards'].sum() for path in paths]
        episode_lengths = [len(p['rewards']) for p in paths]

        logger.record_tabular('return-average', np.mean(total_returns))
        logger.record_tabular('return-min', np.min(total_returns))
        logger.record_tabular('return-max', np.max(total_returns))
        logger.record_tabular('return-std', np.std(total_returns))
        logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
        logger.record_tabular('episode-length-min', np.min(episode_lengths))
        logger.record_tabular('episode-length-max', np.max(episode_lengths))
        logger.record_tabular('episode-length-std', np.std(episode_lengths))
        logger.record_tabular('epoch', epoch)

        self._eval_env.log_diagnostics(paths)
        if self._eval_render:
            self._eval_env.render(paths)

        batch = self.pool.random_batch(self._batch_size)
        self.log_diagnostics(batch)
Example #4
    def _evaluate(self, epoch):
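        """Perform evaluation for the current policy and save diagnostic plots."""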

        logger.log("Collecting samples for evaluation")
        snapshot_dir = logger.get_snapshot_dir()

        paths = rollouts(self._env, self._eval_policy, self._max_path_length,
                         self._n_eval_episodes)

        average_discounted_return = np.mean([
            special.discount_return(path["rewards"], self._discount)
            for path in paths
        ])
        returns = np.asarray([sum(path["rewards"]) for path in paths])

        statistics = OrderedDict([
            ('Epoch', epoch),
            ('AverageDiscountedReturn', average_discounted_return),
            ('Alpha', self._alpha),
            ('returns', returns),
        ])

        for key, value in statistics.items():
            logger.record_tabular(key, value)

        self._env.log_diagnostics(paths)

        # Plot test paths.
        if (hasattr(self._env, 'plot_paths')
                and self._env_plot_settings is not None):
            img_file = os.path.join(snapshot_dir, 'env_itr_%05d.png' % epoch)
            # Remove previous paths.
            if self._env_lines is not None:
                for line in self._env_lines:
                    line.remove()
            self._env_lines = self._env.plot_paths(paths, self._ax_env)
            plt.pause(0.001)
            plt.draw()
            self._fig_env.savefig(img_file, dpi=100)

        # Plot the Q-function level curves and action samples.
        if (hasattr(self._qf_eval, 'plot_level_curves')
                and self._q_plot_settings is not None):
            img_file = os.path.join(snapshot_dir, 'q_itr_%05d.png' % epoch)
            for ax in self._ax_q_lst:
                ax.clear()
            self._qf_eval.plot_level_curves(
                ax_lst=self._ax_q_lst,
                observations=self._q_plot_settings['obs_lst'],
                action_dims=self._q_plot_settings['action_dims'],
                xlim=self._q_plot_settings['xlim'],
                ylim=self._q_plot_settings['ylim'],
            )
            self._visualization_policy.plot_samples(
                self._ax_q_lst, self._q_plot_settings['obs_lst'])
            for ax in self._ax_q_lst:
                ax.set_xlim(self._q_plot_settings['xlim'])
                ax.set_ylim(self._q_plot_settings['ylim'])
            plt.pause(0.001)
            plt.draw()
            self._fig_q.savefig(img_file, dpi=100)

        gc.collect()
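
The examples above all rely on a `rollouts(env, policy, path_length, n_paths)` helper that returns a list of path dictionaries, each carrying at least a `'rewards'` array for one evaluation episode. The helper itself is not shown here; the sketch below is a minimal, assumed implementation, using a gym-style `env.reset()`/`env.step()` loop and a hypothetical `policy.get_action(observation)` interface, purely to illustrate the data format the evaluation code consumes.

import numpy as np


def rollouts(env, policy, path_length, n_paths):
    """Collect `n_paths` evaluation episodes, each capped at `path_length` steps.

    Minimal sketch under assumed interfaces: a gym-style environment
    (`reset()`/`step()`) and a policy exposing `get_action(observation)`.
    The actual helper in the source codebase may differ.
    """
    paths = []
    for _ in range(n_paths):
        observations, actions, rewards = [], [], []
        observation = env.reset()
        for _ in range(path_length):
            # Assumed to return an action directly; some policies instead
            # return an (action, info) tuple.
            action = policy.get_action(observation)
            next_observation, reward, done, _ = env.step(action)
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            observation = next_observation
            if done:
                break
        paths.append({
            'observations': np.asarray(observations),
            'actions': np.asarray(actions),
            # The _evaluate methods above only require 'rewards'.
            'rewards': np.asarray(rewards),
        })
    return paths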