Example #1
    def _eval_policy(self,
                     eval_freq,
                     eval_env,
                     n_eval_episodes,
                     timesteps_since_eval,
                     deterministic=True):
        """
        Evaluate the current policy on a test environment.

        :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
        :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps (this may vary a little)
        :param n_eval_episodes: (int) Number of episodes to evaluate the agent
        :param timesteps_since_eval: (int) Number of timesteps since last evaluation
        :param deterministic: (bool) Whether to use deterministic or stochastic actions
        :return: (int) Number of timesteps since last evaluation
        """
        if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
            timesteps_since_eval %= eval_freq
            # Synchronise the normalization stats if needed
            sync_envs_normalization(self.env, eval_env)
            mean_reward, std_reward = evaluate_policy(
                self, eval_env, n_eval_episodes, deterministic=deterministic)
            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))
                print("FPS: {:.2f}".format(self.num_timesteps /
                                           (time.time() - self.start_time)))
        return timesteps_since_eval
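
The helper above assumes self.env may be wrapped in VecNormalize and relies on Stable Baselines' sync_envs_normalization and evaluate_policy. A minimal, self-contained sketch of the same pattern outside a model class (the algorithm, environment id, and hyperparameters are assumptions, not taken from the example):

import gym

from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.vec_env import (DummyVecEnv, VecNormalize,
                                             sync_envs_normalization)

# Training env with running normalization statistics
train_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]))
# Separate eval env: stats frozen (training=False), rewards left unnormalized
# so the reported return stays interpretable
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("CartPole-v1")]),
                        training=False, norm_reward=False)

model = PPO2("MlpPolicy", train_env, verbose=0)
model.learn(total_timesteps=10000)

# Copy the running mean/std from the training env before evaluating
sync_envs_normalization(train_env, eval_env)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5,
                                          deterministic=True)
print("mean_reward={:.2f} +/- {:.2f}".format(mean_reward, std_reward))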
Example #2
    def _save_best_model_using_eval_callback(self) -> Tuple[float, float]:

        self._save_normalization_artifacts()

        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        mean_reward, std_reward = custom_evaluate_policy(
            self.model, self.eval_env, n_eval_episodes=self.n_eval_episodes, render=False, deterministic=self.deterministic,
        )

        if mean_reward > self.best_mean_reward_eval:
            # if self.verbose > 0:
            self._logger.debug(
                "{} - New best mean reward eval: {} (vs {})".format(
                    self.num_timesteps, mean_reward, self.best_mean_reward_eval
                )
            )
            self.best_mean_reward_eval = mean_reward
            # Example for saving best model
            if self.verbose > 0:
                self._logger.debug("Saving new best model to {}".format(self.save_path_eval))
            if self.save_model:
                self.model.save(self.save_path_eval)

        return mean_reward, std_reward
Example #3

def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
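
The test above relies on a make_env factory that is not shown in this excerpt. A minimal sketch of one possible definition (the environment id is an assumption; the original test may well use a custom dummy environment):

import gym

def make_env():
    # Hypothetical factory: any gym.Env with a Box observation space works with VecNormalize
    return gym.make("Pendulum-v0")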
Example #4

    def _on_step(self) -> bool:
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        self.rank = rank
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0 and rank == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                rank=self.rank)
            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(self.log_path,
                         timesteps=self.evaluations_timesteps,
                         results=self.evaluations_results,
                         ep_lengths=self.evaluations_length)

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            # Keep track of the last evaluation, useful for classes that derive from this callback
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))
                print("Episode length: {:.2f} +/- {:.2f}".format(
                    mean_ep_length, std_ep_length))

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, 'best_model'))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
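
This variant differs from a standard evaluation callback mainly in the rank == 0 guard, which keeps all but one MPI worker from running the (potentially slow) evaluation. A standalone illustration of the same gate, assuming mpi4py is installed:

from mpi4py import MPI

rank = MPI.COMM_WORLD.Get_rank()
if rank == 0:
    # Only worker 0 would evaluate and write logs here
    print("rank 0: running evaluation")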
Example #5
    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            results = self.evaluate_policy(
                self.model,
                n_eval_episodes=self.n_eval_episodes,
                deterministic=self.deterministic)

            if self.log_path is not None:
                self.evaluations_results["evaluation_after_{}_steps".format(self.n_calls)] = results
                print("Storing evaluation results after {} calls.".format(self.n_calls))
                filename = "evaluation_results.json"
                with open(os.path.join(self.log_path, filename), 'w') as f:
                    json.dump(self.evaluations_results, f, indent=4)

        return True
Example #6
    def _on_step(self) -> bool:
        # Log additional tensor
        if not self.is_tb_set:
            with self.model.graph.as_default():
                tf.summary.scalar('eval_episode_reward', self.last_mean_reward)
                self.model.summary = tf.summary.merge_all()
            self.is_tb_set = True

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # self.eval_env.seed(self.seed)

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True)

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(self.log_path,
                         timesteps=self.evaluations_timesteps,
                         results=self.evaluations_results,
                         ep_lengths=self.evaluations_length)

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            # Keep track of the last evaluation, useful for classes that derive from this callback
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print("Eval num_timesteps={}, "
                      "episode_reward={:.2f} +/- {:.2f}".format(
                          self.num_timesteps, mean_reward, std_reward))
                print("Episode length: {:.2f} +/- {:.2f}".format(
                    mean_ep_length, std_ep_length))

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, 'best_model'))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        # Log episode mean reward
        summary = tf.Summary(value=[
            tf.Summary.Value(tag='eval_episode_reward',
                             simple_value=self.last_mean_reward)
        ])
        self.locals['writer'].add_summary(summary, self.num_timesteps)

        return True
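
The tf.Summary calls above only work when the training loop exposes a TensorBoard writer in its locals, which in Stable Baselines (v2) happens when the model is created with a tensorboard_log directory. A minimal sketch of that setup (the algorithm, log path, and callback instance name are assumptions):

from stable_baselines import PPO2

# eval_callback is assumed to be an instance of the callback class shown above
model = PPO2("MlpPolicy", train_env, tensorboard_log="./tensorboard/", verbose=1)
model.learn(total_timesteps=50000, callback=eval_callback)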
Example #7
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
            mean_ep_length, std_ep_length = (
                np.mean(episode_lengths),
                np.std(episode_lengths),
            )
            # Keep track of the last evaluation, useful for classes that derive from this callback
            self.last_mean_reward = mean_reward

            self.num_evaluation_steps += 1

            mean_reward_summary = tf.Summary(value=[tf.Summary.Value(tag="eval_mean_reward", simple_value=mean_reward)])
            std_reward_summary = tf.Summary(value=[tf.Summary.Value(tag="eval_std_reward", simple_value=std_reward)])
            mean_ep_length_summary = tf.Summary(
                value=[tf.Summary.Value(tag="eval_mean_ep_length", simple_value=mean_ep_length)]
            )
            std_ep_length_summary = tf.Summary(value=[tf.Summary.Value(tag="eval_std_ep_length", simple_value=std_ep_length)])
            self.locals["writer"].add_summary(mean_reward_summary, self.num_evaluation_steps)
            self.locals["writer"].add_summary(std_reward_summary, self.num_evaluation_steps)
            self.locals["writer"].add_summary(mean_ep_length_summary, self.num_evaluation_steps)
            self.locals["writer"].add_summary(std_ep_length_summary, self.num_evaluation_steps)

            if self.verbose > 0:
                self._logger.debug(
                    "Eval num_timesteps={}, "
                    "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward)
                )
                self._logger.debug("Episode length: {:.2f} +/- {:.2f}".format(mean_ep_length, std_ep_length))

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    self._logger.debug("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
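
Most of the snippets above are variations on Stable Baselines' built-in EvalCallback, which already handles the normalization sync, periodic evaluation, and best-model saving. A minimal sketch of using the stock callback instead (the model, eval_env, paths, and frequencies are assumptions):

from stable_baselines.common.callbacks import EvalCallback

eval_callback = EvalCallback(eval_env,
                             best_model_save_path="./logs/best_model/",
                             log_path="./logs/eval/",
                             eval_freq=10000,
                             n_eval_episodes=5,
                             deterministic=True)
model.learn(total_timesteps=100000, callback=eval_callback)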