Example #1
    def _on_step(self):
        if self.n_calls % self._check_freq == 0:
            sync_envs_normalization(self.training_env, self._eval_env)
            video, rewards = render_trajectory(self._eval_env, self.model)
            self.logger.record("trajectory/video", video)
            self.logger.record("trajectory/return", sum(rewards))
        return True
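A rough sketch of how a callback like this could be wired into training. The callback class name below is hypothetical, `render_trajectory` is a helper from that project, and the env setup is only illustrative:

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

train_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v1")]))
# Freeze eval statistics; keep raw rewards for reporting
eval_env = VecNormalize(DummyVecEnv([lambda: gym.make("Pendulum-v1")]),
                        training=False, norm_reward=False)
model = PPO("MlpPolicy", train_env)
# VideoTrajectoryCallback is a hypothetical class containing the
# _on_step above, with _check_freq and _eval_env set in __init__
model.learn(total_timesteps=100_000,
            callback=VideoTrajectoryCallback(eval_env, check_freq=10_000))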
Example #2
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # self.n_eval_episodes = len(self.eval_env.get_attr("config"))
            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                )
            mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, " f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            # Recover per-episode distances (this project's reward is
            # assumed to be 10 / distance), then rank models by mean
            # distance rather than by mean reward
            dists = [10 / reward for reward in episode_rewards]
            mean_dist = np.mean(dists)
            mean_reward = 1 / mean_dist
            self.last_mean_reward = mean_reward

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")

                print(f"New best mean reward: {mean_reward:.4f} with mean distance: {mean_dist:.4f}")
                self.best_mean_reward = mean_reward
                
                if self.best_model_save_path is not None and self.best_mean_reward > self.best_reward:
                    self.best_reward = self.best_mean_reward  
                    self.model.save(os.path.join(self.best_model_save_path, "best_model"))
                    with open(f"{self.best_model_save_path}/best_reward.npy", 'wb') as f:
                        np.save(f, np.array([self.best_reward]))

                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
Example #3
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
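At its core, the sync this test exercises copies the running statistics from the training wrapper into the eval wrapper. A minimal sketch of the equivalent manual operation (sync_envs_normalization additionally walks nested VecEnvWrappers to locate the VecNormalize layer):

from copy import deepcopy

train_norm = unwrap_vec_normalize(env)
eval_norm = unwrap_vec_normalize(eval_env)
# Copy the running mean/std of observations and returns, train -> eval
eval_norm.obs_rms = deepcopy(train_norm.obs_rms)
eval_norm.ret_rms = deepcopy(train_norm.ret_rms)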
Example #4
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)
                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, "
                      f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(
                    f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}"
                )
            """ Add prefix """
            # Add to current Logger
            self.logger.record(f"eval/{self.prefix}_mean_reward",
                               float(mean_reward))
            # self.logger.record(f"eval/{self.prefix}_mean_ep_length", mean_ep_length)

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = mean_reward
                self.update_best_reward(self.eval_env.envs[0].robot.robot_id,
                                        self.best_mean_reward)
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
Example #5
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env,
                       norm_obs=True,
                       norm_reward=True,
                       clip_obs=100.0,
                       clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env,
                            training=False,
                            norm_obs=True,
                            norm_reward=True,
                            clip_obs=100.0,
                            clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward,
                       env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)
    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards),
                    eval_env.normalize_reward(dummy_rewards))
Example #6
    def do_eval(self):
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)
        # Note: this evaluate_policy is a project-specific wrapper;
        # stable-baselines3's own evaluate_policy does not accept these
        # extra keyword arguments
        return evaluate_policy(self.model, self.eval_env,
                               n_eval_episodes=self.n_eval_episodes,
                               deterministic=self.deterministic,
                               global_step=self.num_timesteps,
                               commit_logs=None,
                               debug=self.debug,
                               file_log_path=self.log_path,
                               main_prefix=self.prefix,
                               verbose=self.verbose)
Example #7
    def _on_step(self) -> bool:
        if not (self.eval_freq > 0 and self.n_calls % self.eval_freq == 0):
            return True
        # Sync training and eval env if there is VecNormalize
        sync_envs_normalization(self.training_env, self.eval_env)

        if self.model is None or self.model.policy is None:
            raise ValueError("Model/policy is None.")
        env = self.eval_env.envs[0]
        results = []
        for _ in range(self.n_eval_episodes):
            env.reset()
            ep_results = util.eval_episode(
                self.model.policy,
                self.model.policy.mlp_extractor,
                env,
                self.cfg != "none",
            )
            results.append(ep_results)

        episode_rewards = [r["total_reward"] for r in results]
        episode_lengths = [r["steps"] for r in results]
        bn_activations = np.concatenate([r["bn_activations"] for r in results])

        entropy = util.get_entropy(bn_activations)
        self.writer.add_scalar("entropy", entropy, self.num_timesteps)

        if self.log_path is not None:
            self.evaluations_timesteps.append(self.num_timesteps)
            self.evaluations_results.append(sum(episode_rewards) / len(episode_rewards))
            self.evaluations_length.append(np.mean(episode_lengths))
            np.savez(
                self.log_path / "data",
                timesteps=self.evaluations_timesteps,
                results=self.evaluations_results,
                ep_lengths=self.evaluations_length,
            )

        mean_reward, _ = np.mean(episode_rewards), np.std(episode_rewards)
        mean_ep_length, _ = np.mean(episode_lengths), np.std(episode_lengths)
        self.writer.add_scalar("mean_reward", float(mean_reward), self.num_timesteps)
        self.writer.add_scalar("mean_ep_length", mean_ep_length, self.num_timesteps)
        self.writer.add_scalar("rate", self.num_timesteps, self.num_timesteps)
        torch.save(
            self.model.policy.state_dict(),
            self.log_path / f"model-{self.num_timesteps}.pt",
        )
        return True
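The `self.writer` used above is presumably a TensorBoard SummaryWriter; a minimal, runnable sketch of the same logging API (path and values are illustrative):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="logs/eval")   # illustrative path
writer.add_scalar("entropy", 0.5, 0)          # same API as used above
writer.close()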
Example #8
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a VecVideoRecorder
    # (`make_env` and `log_dir` are assumed to be defined at module level)
    local_eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                              record_video_trigger=lambda step: step == 0, video_length=videoLength,
                              name_prefix=prefix)
    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
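A call site for this helper might look as follows (environment id and prefix are purely illustrative; `model` is the trained agent and `train_env` its VecNormalize-wrapped training env):

record_video("Pendulum-v1", train_env, model,
             videoLength=500, prefix="ppo-pendulum", videoPath="videos/")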
Example #9
    def _on_step(self):
        sync_envs_normalization(self.training_env, self.eval_env)

        episode_rewards, episode_lengths = evaluate_policy(self.model, self.eval_env,
                                                           n_eval_episodes=self.n_eval_episodes,
                                                           render=False,
                                                           deterministic=self.deterministic,
                                                           return_episode_rewards=True)

        episode_reward_mean, std_reward = np.mean(
            episode_rewards), np.std(episode_rewards)
        mean_ep_length, std_ep_length = np.mean(
            episode_lengths), np.std(episode_lengths)

        # `report` is assumed to be the host tuning framework's metric
        # hook (e.g. ray.tune.report)
        report(
            episode_reward_mean=episode_reward_mean,
            std_reward=std_reward,
            mean_ep_length=mean_ep_length,
            std_ep_length=std_ep_length
        )

        # _on_step must return a bool in stable-baselines3; returning
        # None (falsy) would stop training
        return True
Example #10
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, "
                      f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(
                    f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}"
                )
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose > 0:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
Example #11
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            if self.model.get_vec_normalize_env() is not None:
                try:
                    sync_envs_normalization(self.training_env, self.eval_env)
                except AttributeError:
                    raise AssertionError(
                        "Training and eval env are not wrapped the same way, "
                        "see https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html#evalcallback "
                        "and warning above.")

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, "
                      f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(
                    f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}"
                )
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose > 0:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            # Dump log so the evaluation results are printed with the correct timestep
            self.logger.record("time/total_timesteps",
                               self.num_timesteps,
                               exclude="tensorboard")
            self.logger.dump(self.num_timesteps)

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, "best_model"))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
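The evaluation history these callbacks persist with np.savez can be read back with np.load. A sketch, assuming the standard EvalCallback layout where results has shape (n_evals, n_eval_episodes) and np.savez has appended the .npz suffix to log_path:

import numpy as np

data = np.load("logs/evaluations.npz")          # illustrative path
timesteps = data["timesteps"]                   # (n_evals,)
mean_returns = data["results"].mean(axis=1)     # mean over eval episodes
mean_lengths = data["ep_lengths"].mean(axis=1)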
Example #12
    def _on_step(self) -> bool:

        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # Reset success rate buffer
            self._is_success_buffer = []
            # This project's evaluate_policy returns a dict with keys:
            # episodes_rewards, episodes_lengths, episodes_powers,
            # episodes_comfort_violations, episodes_comfort_penalties,
            # episodes_power_penalties
            episodes_data = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                callback=None,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(
                    episodes_data['episodes_rewards'])
                self.evaluations_length.append(
                    episodes_data['episodes_lengths'])
                self.evaluations_power_consumption.append(
                    episodes_data['episodes_powers'])
                self.evaluations_comfort_violation.append(
                    episodes_data['episodes_comfort_violations'])
                self.evaluations_comfort_penalty.append(
                    episodes_data['episodes_comfort_penalties'])
                self.evaluations_power_penalty.append(
                    episodes_data['episodes_power_penalties'])

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    ep_powers=self.evaluations_power_consumption,
                    ep_comfort_violations=self.evaluations_comfort_violation,
                    episodes_comfort_penalties=self.evaluations_comfort_penalty,
                    episodes_power_penalties=self.evaluations_power_penalty,
                    **kwargs,
                )

            mean_reward, std_reward = np.mean(
                episodes_data['episodes_rewards']), np.std(
                episodes_data['episodes_rewards'])
            mean_ep_length, std_ep_length = np.mean(
                episodes_data['episodes_lengths']), np.std(
                episodes_data['episodes_lengths'])

            self.evaluation_metrics['mean_rewards'] = mean_reward
            self.evaluation_metrics['std_rewards'] = std_reward
            self.evaluation_metrics['mean_ep_length'] = mean_ep_length
            self.evaluation_metrics['mean_power_consumption'] = np.mean(
                episodes_data['episodes_powers'])
            self.evaluation_metrics['comfort_violation(%)'] = np.mean(
                episodes_data['episodes_comfort_violations'])
            self.evaluation_metrics['comfort_penalty'] = np.mean(
                episodes_data['episodes_comfort_penalties'])
            self.evaluation_metrics['power_penalty'] = np.mean(
                episodes_data['episodes_power_penalties'])

            if self.verbose > 0:
                print(
                    f"Eval num_timesteps={self.num_timesteps}, "
                    f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(
                    f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}")
            # Add to current Logger
            for key, metric in self.evaluation_metrics.items():
                self.logger.record('eval/' + key, metric)

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose > 0:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            if mean_reward > self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(
                            self.best_model_save_path,
                            'model.zip'))
                self.best_mean_reward = mean_reward
                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
Example #13
    def _on_step(self) -> bool:
        # Snapshot parameters one step before each evaluation
        # (guard eval_freq > 0 to avoid a modulo-by-zero)
        if self.eval_freq > 0 and (self.n_calls - 1) % self.eval_freq == 0:
            self.old_params = [
                param.clone() for param in self.model.policy.parameters()
            ]
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Sync training and eval env if there is VecNormalize
            sync_envs_normalization(self.training_env, self.eval_env)

            # Reset success rate buffer
            self._is_success_buffer = []

            episode_rewards, episode_lengths = evaluate_policy(
                self.model,
                self.eval_env,
                n_eval_episodes=self.n_eval_episodes,
                render=self.render,
                deterministic=self.deterministic,
                return_episode_rewards=True,
                warn=self.warn,
                callback=self._log_success_callback,
            )

            if self.log_path is not None:
                self.evaluations_timesteps.append(self.num_timesteps)
                self.evaluations_results.append(episode_rewards)
                self.evaluations_length.append(episode_lengths)

                kwargs = {}
                # Save success log if present
                if len(self._is_success_buffer) > 0:
                    self.evaluations_successes.append(self._is_success_buffer)
                    kwargs = dict(successes=self.evaluations_successes)

                np.savez(
                    self.log_path,
                    timesteps=self.evaluations_timesteps,
                    results=self.evaluations_results,
                    ep_lengths=self.evaluations_length,
                    **kwargs,
                )

            mean_reward, std_reward = np.mean(episode_rewards), np.std(
                episode_rewards)
            mean_ep_length, std_ep_length = np.mean(episode_lengths), np.std(
                episode_lengths)
            self.last_mean_reward = mean_reward

            if self.verbose > 0:
                print(f"Eval num_timesteps={self.num_timesteps}, "
                      f"episode_reward={mean_reward:.2f} +/- {std_reward:.2f}")
                print(
                    f"Episode length: {mean_ep_length:.2f} +/- {std_ep_length:.2f}"
                )
            # Add to current Logger
            self.logger.record("eval/mean_reward", float(mean_reward))
            self.logger.record("eval/mean_ep_length", mean_ep_length)

            if len(self._is_success_buffer) > 0:
                success_rate = np.mean(self._is_success_buffer)
                if self.verbose > 0:
                    print(f"Success rate: {100 * success_rate:.2f}%")
                self.logger.record("eval/success_rate", success_rate)

            # Dump log so the evaluation results are printed with the correct timestep
            self.logger.record("time/total timesteps",
                               self.num_timesteps,
                               exclude="tensorboard")
            self.logger.dump(self.num_timesteps)

            if mean_reward >= self.best_mean_reward:
                if self.verbose > 0:
                    print("New best mean reward!")
                if self.best_model_save_path is not None:
                    self.model.save(
                        os.path.join(self.best_model_save_path, "checkpoint"))
                self.best_mean_reward = mean_reward
                os.makedirs(self.log_path, exist_ok=True)
                save_path = os.path.join(self.log_path, "checkpoint")
                self.save_folders.append(save_path)
                model_parameters = self.model.policy.state_dict()
                # Gradients live on the policy's parameters, not on the
                # detached tensors returned by state_dict()
                grads = OrderedDict([
                    (name, param.grad)
                    for name, param in self.model.policy.named_parameters()
                ])
                torch.save(model_parameters,
                           os.path.join(self.log_path, "parameters.th"))
                torch.save(grads, os.path.join(self.log_path, "grads.th"))

                if self.old_params is not None:
                    # Pair with named_parameters() so the ordering matches
                    # the clones taken from parameters() above
                    delta = OrderedDict([
                        (name, param - old_param)
                        for old_param, (name, param) in zip(
                            self.old_params,
                            self.model.policy.named_parameters())
                    ])
                    torch.save(delta,
                               os.path.join(self.log_path, "prev_step.th"))

                # Trigger callback if needed
                if self.callback is not None:
                    return self._on_event()

        return True
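To inspect what this callback writes, the saved tensors can be loaded back with torch.load and the parameters restored onto a policy. A sketch, assuming `log_path` and `model` from the surrounding training script:

import os
import torch

params = torch.load(os.path.join(log_path, "parameters.th"))
model.policy.load_state_dict(params)   # restore the saved parameters
grads = torch.load(os.path.join(log_path, "grads.th"))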