def _eval_policy(self, eval_freq, eval_env, n_eval_episodes, timesteps_since_eval, deterministic=True):
    """
    Evaluate the current policy on a test environment.

    :param eval_freq: (int) Evaluate the agent every `eval_freq` timesteps (this may vary a little)
    :param eval_env: (gym.Env) Environment that will be used to evaluate the agent
    :param n_eval_episodes: (int) Number of episodes used to evaluate the agent
    :param timesteps_since_eval: (int) Number of timesteps since the last evaluation
    :param deterministic: (bool) Whether to use deterministic or stochastic actions
    :return: (int) Number of timesteps since the last evaluation
    """
    if 0 < eval_freq <= timesteps_since_eval and eval_env is not None:
        timesteps_since_eval %= eval_freq
        # Synchronise the normalization stats if needed
        sync_envs_normalization(self.env, eval_env)
        mean_reward, std_reward = evaluate_policy(self, eval_env, n_eval_episodes,
                                                  deterministic=deterministic)
        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward))
            print("FPS: {:.2f}".format(self.num_timesteps / (time.time() - self.start_time)))
    return timesteps_since_eval
def _save_best_model_using_eval_callback(self) -> Tuple[float, float]:
    self._save_normalization_artifacts()
    # Sync training and eval env if there is VecNormalize
    sync_envs_normalization(self.training_env, self.eval_env)
    mean_reward, std_reward = custom_evaluate_policy(
        self.model,
        self.eval_env,
        n_eval_episodes=self.n_eval_episodes,
        render=False,
        deterministic=self.deterministic,
    )
    if mean_reward > self.best_mean_reward_eval:
        # if self.verbose > 0:
        self._logger.debug(
            "{} - New best mean reward eval: {} (vs {})".format(
                self.num_timesteps, mean_reward, self.best_mean_reward_eval
            )
        )
        self.best_mean_reward_eval = mean_reward
        # Example for saving best model
        if self.verbose > 0:
            self._logger.debug("Saving new best model to {}".format(self.save_path_eval))
        if self.save_model:
            self.model.save(self.save_path_eval)
    return mean_reward, std_reward
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True,
                            clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
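The test above relies on a `make_env` helper and the usual stable-baselines vec-env imports that are not shown here. A minimal sketch of that scaffolding follows; the `Pendulum-v0` environment is an assumption, any Gym env with a Box observation space would work.

# Scaffolding assumed by test_sync_vec_normalize (sketch, not the original test module).
import gym
import numpy as np

from stable_baselines.common.vec_env import (DummyVecEnv, VecFrameStack, VecNormalize,
                                             sync_envs_normalization, unwrap_vec_normalize)


def make_env():
    # Pendulum-v0 is an arbitrary choice; the test only needs a continuous observation space.
    return gym.make('Pendulum-v0')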
def _on_step(self) -> bool:
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    self.rank = rank

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0 and rank == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env,
                                                           n_eval_episodes=self.n_eval_episodes,
                                                           render=self.render,
                                                           deterministic=self.deterministic,
                                                           return_episode_rewards=True,
                                                           rank=self.rank)

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(self.log_path, timesteps=self.evaluations_timesteps,
                     results=self.evaluations_results, ep_lengths=self.evaluations_length)

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        # Keep track of the last evaluation, useful for classes that derive from this callback
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward))
            print("Episode length: {:.2f} +/- {:.2f}".format(mean_ep_length, std_ep_length))

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, 'best_model'))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        results = self.evaluate_policy(self.model,
                                       n_eval_episodes=self.n_eval_episodes,
                                       deterministic=self.deterministic)

        if self.log_path is not None:
            self.evaluations_results["evaluation_after_{}_steps".format(self.n_calls)] = results
            print("Storing evaluation results after {} calls.".format(self.n_calls))
            filename = "evaluation_results.json"
            with open(os.path.join(self.log_path, filename), 'w') as f:
                json.dump(self.evaluations_results, f, indent=4)

    return True
def _on_step(self) -> bool:
    # Log additional tensor
    if not self.is_tb_set:
        with self.model.graph.as_default():
            tf.summary.scalar('eval_episode_reward', self.last_mean_reward)
            self.model.summary = tf.summary.merge_all()
        self.is_tb_set = True

    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)
        # self.eval_env.seed(self.seed)

        episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env,
                                                           n_eval_episodes=self.n_eval_episodes,
                                                           render=self.render,
                                                           deterministic=self.deterministic,
                                                           return_episode_rewards=True)

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(self.log_path, timesteps=self.evaluations_timesteps,
                     results=self.evaluations_results, ep_lengths=self.evaluations_length)

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        # Keep track of the last evaluation, useful for classes that derive from this callback
        self.last_mean_reward = mean_reward

        if self.verbose > 0:
            print("Eval num_timesteps={}, "
                  "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward))
            print("Episode length: {:.2f} +/- {:.2f}".format(mean_ep_length, std_ep_length))

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                print("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, 'best_model'))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    # Log episode mean reward
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='eval_episode_reward', simple_value=self.last_mean_reward)
    ])
    self.locals['writer'].add_summary(summary, self.num_timesteps)

    return True
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths = evaluate_policy(
            self.model,
            self.eval_env,
            n_eval_episodes=self.n_eval_episodes,
            render=self.render,
            deterministic=self.deterministic,
            return_episode_rewards=True,
        )

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(episode_rewards)
            self.evaluations_length.append(episode_lengths)
            np.savez(
                self.log_path,
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
            )

        mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
        # Keep track of the last evaluation, useful for classes that derive from this callback
        self.last_mean_reward = mean_reward
        self.num_evaluation_steps += 1

        mean_reward_summary = tf.Summary(value=[tf.Summary.Value(tag="eval_mean_reward", simple_value=mean_reward)])
        std_reward_summary = tf.Summary(value=[tf.Summary.Value(tag="eval_std_reward", simple_value=std_reward)])
        mean_ep_length_summary = tf.Summary(
            value=[tf.Summary.Value(tag="eval_mean_ep_length", simple_value=mean_ep_length)]
        )
        std_ep_length_summary = tf.Summary(
            value=[tf.Summary.Value(tag="eval_std_ep_length", simple_value=std_ep_length)]
        )
        self.locals["writer"].add_summary(mean_reward_summary, self.num_evaluation_steps)
        self.locals["writer"].add_summary(std_reward_summary, self.num_evaluation_steps)
        self.locals["writer"].add_summary(mean_ep_length_summary, self.num_evaluation_steps)
        self.locals["writer"].add_summary(std_ep_length_summary, self.num_evaluation_steps)

        if self.verbose > 0:
            self._logger.debug(
                "Eval num_timesteps={}, "
                "episode_reward={:.2f} +/- {:.2f}".format(self.num_timesteps, mean_reward, std_reward)
            )
            self._logger.debug("Episode length: {:.2f} +/- {:.2f}".format(mean_ep_length, std_ep_length))

        if mean_reward > self.best_mean_reward:
            if self.verbose > 0:
                self._logger.debug("New best mean reward!")
            if self.best_model_save_path is not None:
                self.model.save(os.path.join(self.best_model_save_path, "best_model"))
            self.best_mean_reward = mean_reward
            # Trigger callback if needed
            if self.callback is not None:
                return self._on_event()

    return True
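All of the `_on_step` variants above follow the same pattern: synchronise the `VecNormalize` statistics, evaluate, then log and checkpoint. Below is a minimal usage sketch of wiring such a callback into training; it assumes stable-baselines >= 2.10, whose built-in `EvalCallback` performs the same `sync_envs_normalization` call before each evaluation, and the environment and hyperparameter choices are illustrative only.

import gym

from stable_baselines import PPO2
from stable_baselines.common.callbacks import EvalCallback
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# The training env updates its normalization statistics; the eval env keeps them
# frozen and receives a copy via sync_envs_normalization before each evaluation.
train_env = VecNormalize(DummyVecEnv([lambda: gym.make('Pendulum-v0')]))
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make('Pendulum-v0')]),
                        training=False, norm_reward=False)

eval_callback = EvalCallback(eval_env, eval_freq=5000, n_eval_episodes=5,
                             deterministic=True, best_model_save_path='./logs/',
                             log_path='./logs/')

model = PPO2('MlpPolicy', train_env, verbose=1)
model.learn(total_timesteps=100000, callback=eval_callback)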