import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Project-specific import; adjust to wherever EnvFactory lives in your repo.
from env_factory import EnvFactory


def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            # No expert loaded: send a zero action, batched for the VecEnv
            # (a bare action_space.sample() would be missing the batch dimension).
            action = np.zeros((env.num_envs,) + env.action_space.shape)
        else:
            # The expert may have been trained on a prefix of the observation,
            # so slice off the first expert_state_dim entries.
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        # Accumulate the unnormalized reward, not the VecNormalize-scaled one.
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done[0]:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
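# A minimal sketch of a CLI entry point for main(), assuming the three flags the
# function reads (args.env, args.policy_path, args.stats_path). The flag names and
# help strings are illustrative, not prescribed by the original script.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Roll out a policy and print per-episode rewards")
    parser.add_argument("--env", required=True, help="Environment name passed to EnvFactory")
    parser.add_argument("--policy_path", default=None, help="Path to a saved PPO expert (optional)")
    parser.add_argument("--stats_path", default=None, help="Path to saved VecNormalize statistics (optional)")
    main(parser.parse_args())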
import numpy as np
from gymnasium import spaces
from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    VecFrameStack,
    VecNormalize,
    sync_envs_normalization,
    unwrap_vec_normalize,
)


def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])
    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)
    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        # Unwrapping must see through the VecFrameStack wrapper
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(
        eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0
    )
    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize the running mean with some transitions
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that the unnormalized reward matches the original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization recovers the original observation
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization statistics must differ between the two environments
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)
    # Now the statistics must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
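# The test above calls a bare allclose() alongside np.allclose(): in the
# stable-baselines3 test suite this is a small module-level helper that
# generalizes np.allclose() to dict observations, and make_env is supplied as a
# pytest parameter. A minimal sketch of that helper, plus a hypothetical
# make_env factory (the env id and parametrization are assumptions, shown only
# to make the test runnable standalone):
import gymnasium as gym


def allclose(obs_1, obs_2):
    """Generalized np.allclose() that also compares dict observations key by key."""
    if isinstance(obs_1, dict):
        assert obs_1.keys() == obs_2.keys()
        return all(np.allclose(obs_1[key], obs_2[key]) for key in obs_1.keys())
    return np.allclose(obs_1, obs_2)


# Hypothetical factory; wire it up with, e.g.,
# @pytest.mark.parametrize("make_env", [make_env]) on the test above.
def make_env():
    return gym.make("Pendulum-v1")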