def record_video(env_id, model, video_length=500, prefix='', video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
def pybullet_example():
    # PyBullet: Normalizing input features
    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # Reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
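A possible follow-up (not part of the original snippet): with the statistics reloaded and training disabled as above, the restored agent can be scored with stable_baselines3's evaluate_policy helper. `model` and `env` refer to the objects created at the end of pybullet_example.

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the reloaded agent on the normalized (but frozen) evaluation env.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")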
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f"  Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f"  Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False
    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:
            # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, "
          f"total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)

    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
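check_vec_norm_equal is defined elsewhere in the originating test suite; a plausible sketch of such a helper, assuming the standard VecNormalize attributes and non-dict observations, could look like the following (hypothetical, not the original implementation).

import numpy as np

def check_vec_norm_equal(norm_venv, deserialized):
    # Hypothetical helper: compare the saved and reloaded normalization state.
    assert norm_venv.observation_space == deserialized.observation_space
    assert norm_venv.action_space == deserialized.action_space
    # Running statistics for observations and returns (non-dict observation spaces).
    np.testing.assert_allclose(norm_venv.obs_rms.mean, deserialized.obs_rms.mean)
    np.testing.assert_allclose(norm_venv.obs_rms.var, deserialized.obs_rms.var)
    np.testing.assert_allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    np.testing.assert_allclose(norm_venv.ret_rms.var, deserialized.ret_rms.var)
    # Configuration flags should survive the round trip as well.
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward
    assert norm_venv.norm_obs == deserialized.norm_obs
    assert norm_venv.norm_reward == deserialized.norm_reward
    assert norm_venv.gamma == deserialized.gamma
    assert norm_venv.epsilon == deserialized.epsilon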
def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id, wrapper_class=env_wrapper, vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")

            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack([envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device, target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]

        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy', env, verbose=1, n_steps=int(4096 / n_cpu), wandb_use=False)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy', env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000, tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # callback=callback

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
def run_environment(
        algorithm: RLAlgorithm = typer.Option(...),
        agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
        agent_parameters_path: Optional[Path] = None,
        random_agent: bool = False,
        seed: Optional[int] = None,
        environment_port: Optional[int] = None,
        normalize: bool = False,
        n_envs: Optional[int] = None):
    """Run the reacher environment and visualize the actions of the agents.

    Args:
        algorithm: the RL algorithm used to train the agent
        agent_type: choice between single and multi agent environments
        agent_parameters_path: an optional path to load the agent parameters from
        random_agent: if true, agent(s) use a random policy
        seed: seed for the environment; if not set, it will be picked randomly
        environment_port: the port used from python to communicate with the C# environment
            backend. By using different values, one can run multiple environments in parallel.
        normalize: if true, load the saved VecNormalize statistics stored next to the
            agent parameters
        n_envs: number of parallel environments / agents
    """
    env = create_environment(
        agent_type=agent_type,
        normalize=False,
        n_envs=n_envs,
        env_seed=seed,
        environment_port=environment_port,
        training_mode=False,
        no_graphics=False)
    if normalize:
        env = VecNormalize.load(
            str(agent_parameters_path.parent / 'vecnormalize.pkl'), env)

    action_size = env.action_space.shape[0]

    if random_agent:
        agent = RandomAgent(number_of_agents=n_envs, action_size=action_size)
    else:
        agent = TrainedAgent(algorithm=algorithm,
                             parameters_path=str(agent_parameters_path))

    score = 0
    state = env.reset()
    while True:
        actions = agent.act(state)
        state, reward, done, _ = env.step(actions)
        score += reward
        time.sleep(0.005)
        if np.any(done):
            break

    if agent_type == SingleOrMultiAgent.single_agent:
        print(f'Total score this episode: {score}')
    else:
        print(f'Average total score this episode: {np.array(score).mean()}')

    env.close()
def test(self, model_filename, vnorm_filename):
    self.model.load(model_filename)
    self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
    self.eval_env.render()
    obs = self.eval_env.reset()
    with self.model.policy.features_extractor.start_testing():
        for i in range(1000):
            # predict() returns (action, state); unpack before stepping the env
            action, _ = self.model.predict(obs, deterministic=True)
            obs, _, _, _ = self.eval_env.step(action)
    self.eval_env.close()
def test(test_n, seed, model_filename, vec_filename, train, test,
         test_as_class=0, render=False, save_file="default.yml"):
    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f"  Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f"  Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False
        eval_env.seed(seed + i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:
                # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, "
              f"total_reward {total_reward}, distance_x {distance_x}")

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # avoid yaml turn float64 to numpy array
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    # do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    # env = gym.make(args.env)

    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))

    if args.timesteps is None:
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()

    if args.gif:
        imageio.mimsave(
            f'{os.path.join("logs", args.env, args.agent, "recording.gif")}',
            [np.array(img) for i, img in enumerate(img) if i % 2 == 0],
            fps=29)
def load_training_env(env_id, env_path, log_dir, max_train_ep_length, seed):
    """Load a saved vectorized training env (used to continue training)."""
    env = gym.make(env_id)
    env.seed(seed)  # Set random seed
    env = TimeLimitWrapper(env, max_train_ep_length)  # Limit length of training episodes
    env = Monitor(env, log_dir)  # Monitor training
    env = NormalizeActionWrapper(env)  # Normalize action space
    env = DummyVecEnv([lambda: env])  # Vectorize environment
    env = VecNormalize.load(env_path, env)
    env.reset()
    return env
def load_visualization_env(env_id, env_path, seed=0):
    """
    Create an environment using the saved statistics of the training vectorized env
    (used to visualize performance).
    """
    env = gym.make(env_id)
    env.seed(seed)
    # Used to ensure the original action space is not modified by `NormalizeActionWrapper`
    env = Monitor(env)
    env = NormalizeActionWrapper(env)
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(env_path, env)
    return env
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True,
                             clip_obs=clip_obs, clip_reward=clip_reward)

    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def test_current_exp(args):
    if args.save_img:
        all_folders = glob.glob(os.path.join(img_path, "*"))
        all_folders = [os.path.basename(x) for x in all_folders]
        all_folders = [int(x) if x.isnumeric() else -1 for x in all_folders] + [0]
        current_folder = max(all_folders) + 1
        current_folder = os.path.join(img_path, str(current_folder))
        os.makedirs(current_folder, exist_ok=True)
        print(f"Writing into {current_folder}")
        input("Press Enter...")

    env = DummyVecEnv([make_env(env_id=args.env_id, rank=0, seed=0, render=True)])
    env = VecNormalize.load(args.vnorm_filename, env)

    model = CustomizedPPO.load(args.model_filename, env=env)
    callback = AdjustCameraCallback()

    obs = env.reset()
    callback.reset_lights(env.envs[0].env._p)  # once the window is opened, change the lighting
    if args.save_img:
        time.sleep(1)  # please use this time to maximize the window, so that the image recorded will be full size

    with model.policy.features_extractor.start_testing():
        while True:
            for i in range(1000):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                callback.camera_simpy_follow_robot(target_env=env.envs[0])
                if args.save_img:
                    callback.write_a_image(current_folder=current_folder, step=i, target_env=env.envs[0])
                    if obs.shape[1] > 100:  # With Vision I guess
                        image = np.rollaxis(obs[:, -3 * 8 * 8:].reshape([3, 8, 8]), 0, start=3) * 255.0
                        print(image.shape)
                        # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                        cv2.imwrite(f"{current_folder}/vision_{i:05}.png", image)
                if done:
                    break
                time.sleep(0.01)
            break
        time.sleep(0.1)
    env.close()
def _maybe_normalize(self, env: VecEnv, eval_env: bool) -> VecEnv:
    """
    Wrap the env into a VecNormalize wrapper if needed
    and load saved statistics when present.

    :param env:
    :param eval_env:
    :return:
    """
    # Pretrained model, load normalization
    path_ = os.path.join(os.path.dirname(self.trained_agent), self.env_id)
    path_ = os.path.join(path_, "vecnormalize.pkl")

    if os.path.exists(path_):
        print("Loading saved VecNormalize stats")
        env = VecNormalize.load(path_, env)
        # Deactivate training and reward normalization
        if eval_env:
            env.training = False
            env.norm_reward = False

    elif self.normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = self.normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if self.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")

        env.num_envs = self.n_envs
        env = VecNormalize(env, **local_normalize_kwargs)

    return env
def setup(args):
    bridge = Bridge()
    render_key = "renders" if 'CartPole' in args.env else "render"
    env_kwargs = {
        render_key: args.render,
        "adv_force": args.adv_force,
        "mass_percentage": args.mass_percentage,
        "friction_percentage": args.friction_percentage,
        "simple_reward": args.simple_reward,
    }
    env = make_vec_env(args.env, env_kwargs=env_kwargs, seed=args.seed,
                       monitor_dir=args.monitor_dir)

    if args.evaluate:
        env = VecNormalize.load(f'{args.pickle}-{args.envname}', env)
        prot_agent = PPO.load(f'{args.pickle}-{args.prot_name}', device='cpu')
        if prot_agent.seed != args.seed:
            logging.info(f'warning: {prot_agent.seed=} does not match {args.seed=}')
        if args.adversarial:
            adv_agent = PPO.load(args.adv_pickle, device='cpu')
            if adv_agent.seed != args.seed:
                logging.info(f'warning: {adv_agent.seed=} does not match {args.seed=}')
        else:
            adv_agent = None
    else:
        env = VecNormalize(env)
        prot_logname = f'{args.logs}-{args.prot_name}' if args.logs else None
        prot_agent = PPO("MlpPolicy", env, verbose=args.verbose, seed=args.seed,
                         tensorboard_log=prot_logname, n_steps=args.N_steps,
                         is_protagonist=True, bridge=bridge, device='cpu')
        if args.adversarial:
            adv_logname = f'{args.logs}-{args.adv_name}' if args.logs else None
            adv_agent = PPO("MlpPolicy", env, verbose=args.verbose, seed=args.seed,
                            tensorboard_log=adv_logname, n_steps=args.N_steps,
                            is_protagonist=False, bridge=bridge, device='cpu')
        else:
            adv_agent = None

    bridge.link_agents(prot_agent, adv_agent)
    return prot_agent, adv_agent, env
def make_env(
        args,
        num_envs=None,
        include_norm=False,
        norm_reward=True,
        **kwargs,
):
    """Return a vectorized environment containing `num_envs` or `args.num_envs`
    environments (depending on whether `num_envs is None`).

    `args`, the command line arguments, specify several values. See `kwargs`
    for a more detailed explanation on their interaction.

    `include_norm` specifies whether the environment is wrapped in a
    normalizing environment.

    `norm_reward` indicates whether the rewards are normalized (only relevant
    if `include_norm is True`).

    `kwargs` are passed directly to the environment creation function. Any
    value given via `kwargs` has priority over the one given by `args`.
    """
    if num_envs is None:
        num_envs = args.num_envs

    # `kwargs` given via `args`
    args_kwargs = {}
    for arg in [
            'M',
            'dt',
            'restol',
            'lambda_real_interval',
            'lambda_imag_interval',
            'lambda_real_interpolation_interval',
            'norm_factor',
            'residual_weight',
            'step_penalty',
            'reward_iteration_only',
            'reward_strategy',
            'collect_states',
            'example',
    ]:
        args_kwargs[arg] = kwargs.pop(arg, getattr(args, arg))
    all_kwargs = {**kwargs, **args_kwargs}

    # SAC does not support float64
    if args.model_class == 'SAC':
        all_kwargs['use_doubles'] = False

    seed = all_kwargs.pop('seed', args.seed)

    def gym_make(i):
        return lambda: gym.make(
            args.envname,
            seed=seed + i if seed is not None else None,
            **all_kwargs,
        )

    env = DummyVecEnv([gym_make(i) for i in range(num_envs)])
    if include_norm:
        if hasattr(args, 'env_path') and args.env_path is not None:
            env = VecNormalize.load(str(Path(args.env_path)), env)
        else:
            # When training, set `norm_reward = True`, I hear...
            if 'gamma' in args.model_kwargs:
                env = VecNormalize(
                    env,
                    norm_obs=args.norm_obs,
                    norm_reward=norm_reward,
                    gamma=args.model_kwargs['gamma'],
                )
            else:
                env = VecNormalize(
                    env,
                    norm_obs=args.norm_obs,
                    norm_reward=norm_reward,
                )
    if debug_nans:
        env = VecCheckNan(env, raise_exception=True)
    return env
save_model_folder = 'trained_models'
save_model_filename = '2M_OSC_POSE'
load_model_folder = 'trained_models'
load_model_filename = '2M_OSC_POSE'

save_model_path = os.path.join(save_model_folder, save_model_filename)
save_vecnormalize_path = os.path.join(save_model_folder, 'vec_normalize_' + save_model_filename + '.pkl')
load_model_path = os.path.join(load_model_folder, load_model_filename)
load_vecnormalize_path = os.path.join(load_model_folder, 'vec_normalize_' + load_model_filename + '.pkl')

if training:
    env = SubprocVecEnv([make_training_env(env_id, options, i) for i in range(num_cpu)])
    env = VecNormalize(env)

    if isinstance(load_model_for_training_path, str):
        env = VecNormalize.load(load_vecnormalize_for_training_path, env)
        model = PPO.load(load_model_for_training_path, env=env)
    else:
        model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log_folder)

    eval_env_func = make_training_env(env_id, options, rank=num_cpu)
    eval_env = DummyVecEnv([eval_env_func])
    eval_env = VecNormalize(eval_env)
    eval_callback = EvalCallback(eval_env, best_model_save_path='./best_models/',
                                 log_path='./logs_best_model/', deterministic=True,
                                 render=False, n_eval_episodes=10)

    model.learn(total_timesteps=training_timesteps, tb_log_name=tb_log_name, callback=eval_callback)
    model.save(save_model_path)
def normalize_env(
        env,
        orig_log_dir,
        sb_version,
        vectorize=True,
        continue_learning=False,
        evaluate=False,
        evaluate_during_learning=False,
        normalize_kwargs=None,
):
    if vectorize:
        env = DummyVecEnv([lambda: env])

    logger.debug("Normalize: {}".format(normalize_kwargs))

    if evaluate:
        # FIXME in continue learning training should be True so that we update the running average of obs and
        #  rewards with new samples; if I do that, the algo performs very poorly even with no changes in the env
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped, sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}".format(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl") + " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(os.path.join(new_orig_log_dir, "vecnormalize.pkl"), env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

            # Deactivate training and reward normalization
            env.training = False
            env.norm_reward = False

    elif continue_learning:
        # FIXME: don't know why but during continue learning I have to disable training otherwise performance
        #  is not the same as in the model trained from scratch even without changing the params of the environment.
        #  in rl-baselines-zoo this is not done during continue learning:
        #  https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(orig_log_dir, "vecnormalize.pkl")), (
            os.path.join(orig_log_dir, "vecnormalize.pkl") + " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)

    return env
    if 'policy_kwargs' in hyperparams.keys():
        del hyperparams['policy_kwargs']

    model = ALGOS[args.algo].load(args.trained_agent, env=env, seed=args.seed,
                                  tensorboard_log=tensorboard_log, verbose=args.verbose,
                                  **hyperparams)

    exp_folder = args.trained_agent.split('.zip')[0]
    if normalize:
        print("Loading saved running average")
        stats_path = os.path.join(exp_folder, env_id)
        if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
            env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
        else:
            # Legacy:
            env.load_running_average(exp_folder)

    replay_buffer_path = os.path.join(os.path.dirname(args.trained_agent), 'replay_buffer.pkl')
    if os.path.exists(replay_buffer_path):
        print("Loading replay buffer")
        model.load_replay_buffer(replay_buffer_path)

elif args.optimize_hyperparameters:
    if args.verbose > 0:
        print("Optimizing hyperparameters")
    if custom_params['USING_VAE']:
        env = NormalizeWrapper(env)  # No need to use normalization if image
        env = FinalLayerObservationWrapper(env, latent_dim=1028, map="map3")

    # Step 3.b. Make a Vectorized Environment to be able to use Normalize or FrameStack (Optional)
    env = make_vec_env(lambda: env, n_envs=1)
    # Step 3.b Passing through Normalization and stack frame (Optional)
    env = VecFrameStack(env, n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
    if not custom_params['USING_VAE']:
        env = VecTransposeImage(env)  # Uncomment if using 3d obs
    if custom_params['USING_NORMALIZATION']:
        env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"), env)

    # Load the agent
    if custom_params['algo'] == 'sac':
        model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
    elif custom_params['algo'] == 'a2c':
        model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
    elif custom_params['algo'] == 'dqn':
        model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
    elif custom_params['algo'] == 'ppo':
        model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))
    else:
        raise ValueError("Error model")

    # Load the saved statistics
def main(args):
    policy_path = args.policy_path
    expert = PPO.load(policy_path)

    # Initialize environment for input standardization
    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    env = VecNormalize.load(args.stats_path, env)
    env.training = False

    states = []
    for i in np.arange(-10, 110):
        for j in np.arange(-3, 3, 0.05):
            states.append([i, j])
    states = np.stack(states)
    states_scaled = env.normalize_obs(states)
    states_tensor = torch.as_tensor(states_scaled).float()

    policy: ActorCriticPolicy = expert.policy.cpu()
    true_actions_tensor, _, _ = policy.forward(states_tensor, deterministic=True)
    features_tensor = policy.features_extractor.forward(states_tensor)
    shared_latents_tensor = policy.mlp_extractor.shared_net.forward(features_tensor)
    policy_latents_tensor_layer1 = policy.mlp_extractor.policy_net[0].forward(shared_latents_tensor)
    policy_latents_tensor_layer1_activated = policy.mlp_extractor.policy_net[1].forward(policy_latents_tensor_layer1)
    policy_latents_tensor_layer2 = policy.mlp_extractor.policy_net[2].forward(policy_latents_tensor_layer1_activated)
    policy_latents_tensor_layer2_activated = policy.mlp_extractor.policy_net[3].forward(policy_latents_tensor_layer2)
    actions_tensor = policy.action_net.forward(policy_latents_tensor_layer2_activated)
    assert actions_tensor.equal(true_actions_tensor)

    binary_embeddings_layer1 = policy_latents_tensor_layer1_activated > 0
    binary_embeddings_layer1 = binary_embeddings_layer1.cpu().detach().numpy()
    binary_embeddings_layer2 = policy_latents_tensor_layer2_activated > 0
    binary_embeddings_layer2 = binary_embeddings_layer2.cpu().detach().numpy()
    binary_embeddings = np.concatenate(
        [binary_embeddings_layer1, binary_embeddings_layer2], axis=1).astype(int)

    integer_embeddings = np.packbits(binary_embeddings, axis=1, bitorder="little")
    integer_embeddings = integer_embeddings @ (256 ** np.arange(integer_embeddings.shape[1]))  # to allow arbitrary number of bits

    # convert raw integer embeddings to 0, 1, 2, 3...

    # fast rendering of state cells via grid interpolation
    grid_x, grid_y = np.mgrid[-10:110:1000j, -3:3:1000j]
    z = griddata((states[:, 0], states[:, 1]), integer_embeddings, (grid_x, grid_y), method='nearest')

    # convert raw integer
    convert_raw_integer_to_colorhash = np.vectorize(lambda x: ColorHash(x).rgb)
    grid_z = np.array(convert_raw_integer_to_colorhash(z)).swapaxes(0, 1).swapaxes(1, 2)

    plt.figure()
    plt.imshow(grid_z, extent=[-10, 110, -3, 3], aspect='auto')
    plt.title("State Space Visualized")
    plt.xlabel("$x$")
    plt.ylabel("$\\dot x$")
    plt.show()
def create_test_env(env_id, n_envs=1, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None, env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averaged
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
            for i in range(n_envs)
        ])
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
else:
    # Join file paths
    continue_training_model_path = os.path.join(
        continue_training_model_folder, continue_training_model_filename)
    continue_training_vecnormalize_path = os.path.join(
        continue_training_model_folder,
        'vec_normalize_' + continue_training_model_filename + '.pkl')

    print(f"Continual training on model located at {continue_training_model_path}")

    # Load normalized env
    env = VecNormalize.load(continue_training_vecnormalize_path, env)

    # Load model
    model = PPO.load(continue_training_model_path, env=env)

    # Training
    model.learn(total_timesteps=training_timesteps, tb_log_name=tb_log_name,
                callback=checkpoint_callback, reset_num_timesteps=True)

    # Save trained model
    model.save(save_model_path)
    env.save(save_vecnormalize_path)
else:
def create_test_env(
    env_id: str,
    n_envs: int = 1,
    stats_path: Optional[str] = None,
    seed: int = 0,
    log_dir: Optional[str] = None,
    should_render: bool = True,
    hyperparams: Optional[Dict[str, Any]] = None,
    env_kwargs: Optional[Dict[str, Any]] = None,
) -> VecEnv:
    """
    Create environment for testing a trained agent

    :param env_id:
    :param n_envs: number of processes
    :param stats_path: path to folder containing saved running averaged
    :param seed: Seed for random number generator
    :param log_dir: Where to log rewards
    :param should_render: For Pybullet env, display the GUI
    :param hyperparams: Additional hyperparams (ex: n_stack)
    :param env_kwargs: Optional keyword argument to pass to the env constructor
    :return:
    """
    # Avoid circular import
    from utils.exp_manager import ExperimentManager

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)

    hyperparams = {} if hyperparams is None else hyperparams

    if "env_wrapper" in hyperparams.keys():
        del hyperparams["env_wrapper"]

    vec_env_kwargs = {}
    vec_env_cls = DummyVecEnv
    if n_envs > 1 or (ExperimentManager.is_bullet(env_id) and should_render):
        # HACK: force SubprocVecEnv for Bullet env
        # as Pybullet envs does not follow gym.render() interface
        vec_env_cls = SubprocVecEnv
        # start_method = 'spawn' for thread safe

    env = make_vec_env(
        env_id,
        n_envs=n_envs,
        monitor_dir=log_dir,
        seed=seed,
        wrapper_class=env_wrapper,
        env_kwargs=env_kwargs,
        vec_env_cls=vec_env_cls,
        vec_env_kwargs=vec_env_kwargs,
    )

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            print("Loading running average")
            print(f"with params: {hyperparams['normalize_kwargs']}")
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)

    return env
def create_test_env(
    env_id, n_envs=1, stats_path=None, seed=0, log_dir="", should_render=True, hyperparams=None, env_kwargs=None
):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averaged
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Clean hyperparams, so the dict can be passed to the model constructor
    if True:
        keys_to_delete = ["n_envs", "n_timesteps", "env_wrapper", "callback", "frame_stack"]
        for key in keys_to_delete:
            delete_key(hyperparams, key)

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir, env_kwargs=env_kwargs) for i in range(n_envs)]
        )
    # Pybullet envs does not follow gym.render() interface
    elif "Bullet" in env_id or "Walker2D" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            # print("Loading running average")
            # print("with params: {}".format(hyperparams["normalize_kwargs"]))
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)
    return env