def main(config: str, agent: str):
    with open(config) as fp:
        json_data = json.load(fp)
    config = GameConfig.deserialize(json_data)
    log_dir = config.agents_config[agent]["save_path"]

    if agent == "DQN":
        env = make_atari_env(config.game_name, n_envs=1, seed=0, monitor_dir=log_dir)
    elif agent == "PPO":
        env = make_atari_env(config.game_name, n_envs=8, seed=0, monitor_dir=log_dir)
    else:
        env = make_atari_env(config.game_name, n_envs=16, seed=0, monitor_dir=log_dir)
    env = VecFrameStack(env, n_stack=4)

    agent = AgentLoader.get_agent(agent, config.agents_config, env)
    reward_callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=log_dir)

    start_time = time.time()
    steps = 10_000_000
    with ProgressBarManager_new(steps) as progress_callback:
        agent.agent.learn(total_timesteps=steps, callback=[reward_callback, progress_callback])
    # agent.save()
    env.close()

    elapsed_seconds = time.time() - start_time
    print(f"\nThe training took {elapsed_seconds} seconds")
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10.0, clip_reward=10.0)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    dummy_rewards = np.random.rand(10)
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
def test_sync_vec_normalize(make_env):
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    if not isinstance(env.observation_space, spaces.Dict):
        env = VecFrameStack(env, 1)
        assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=100.0, clip_reward=100.0)

    if not isinstance(env.observation_space, spaces.Dict):
        eval_env = VecFrameStack(eval_env, 1)

    env.seed(0)
    env.action_space.seed(0)

    env.reset()
    # Initialize running mean
    latest_reward = None
    for _ in range(100):
        _, latest_reward, _, _ = env.step([env.action_space.sample()])

    # Check that unnormalized reward is same as original reward
    original_latest_reward = env.get_original_reward()
    assert np.allclose(original_latest_reward, env.unnormalize_reward(latest_reward))

    obs = env.reset()
    dummy_rewards = np.random.rand(10)
    original_obs = env.get_original_obs()
    # Check that unnormalization works
    # (`allclose` is a module-level helper that also handles dict observations)
    assert allclose(original_obs, env.unnormalize_obs(obs))
    # Normalization must be different (between different environments)
    assert not allclose(obs, eval_env.normalize_obs(original_obs))

    # Test syncing of parameters
    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert allclose(obs, eval_env.normalize_obs(original_obs))
    assert allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
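# The two tests above assume a make_env factory; a minimal sketch of one
# (hypothetical -- any Monitor-wrapped Gym env with a Box observation space works):
import gym
from stable_baselines3.common.monitor import Monitor

def make_env():
    return Monitor(gym.make("Pendulum-v1"))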
def train_simple_opponent(args):
    env_name = "WimblepongVisualBadAI-v0"
    env = gym.make(env_name)
    # env = ParallelEnvs(env_name, processes=4, envs_per_process=1)
    env = SubprocVecEnv(
        [make_env(env_name, args.seed + i) for i in range(args.num_envs)],
        start_method="spawn")
    env = VecFrameStack(env, n_stack=4)

    if args.algorithm.lower() == "dqn":
        agent = DQNagent.Agent(env_name, env.observation_space, env.action_space)
    elif args.algorithm.lower() == "ppo":
        agent = ppo_agent_stack_4.Agent()
        agent.init_memory(args.steps_per_env, args.num_envs)
        agent.is_training = True
        if args.checkpoint:
            agent.load_checkpoint()
        elif args.pretrained_model:
            agent.load_model()
    else:
        raise NotImplementedError(f"No such algorithm: {args.algorithm.lower()}")

    train(env, agent, args)
    agent.save_policy()
    env.close()
def make_env(seed: int, n_envs: int, run_dir: str, frame_skip: int,
             frame_stack: int, is_eval: bool = False) -> VecEnv:
    """
    Makes vectorized env with required wrappers

    :param seed: Random seed
    :param n_envs: Number of environments to run in parallel
    :param run_dir: Run directory
    :param frame_skip: Skip every nth frame
    :param frame_stack: Stack n frames together
    :param is_eval: True if used for evaluation
    :return: Vectorized env
    """
    if n_envs == 1:
        env = DummyVecEnv([_env_fn(seed, run_dir, frame_skip, is_eval)])
    else:
        env = SubprocVecEnv([
            _env_fn(seed + i, run_dir, frame_skip, is_eval)
            for i in range(n_envs)
        ])
    if frame_stack > 0:
        return VecFrameStack(env, n_stack=frame_stack)
    return env
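# A minimal usage sketch for make_env above (hypothetical values; assumes the
# module-level _env_fn helper it relies on is defined in the same file):
train_env = make_env(seed=0, n_envs=4, run_dir="runs/exp1", frame_skip=4, frame_stack=4)
eval_env = make_env(seed=1000, n_envs=1, run_dir="runs/exp1", frame_skip=4, frame_stack=4, is_eval=True)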
def test_vec_transpose_skip(tmp_path, model_class):
    # Fake grayscale with frameskip
    env = FakeImageEnv(screen_height=41, screen_width=40, n_channels=10,
                       discrete=model_class not in {SAC, TD3}, channel_first=True)
    env = DummyVecEnv([lambda: env])
    # Stack 5 frames so the observation becomes (50, 41, 40) but the env is still channel-first
    env = VecFrameStack(env, 5, channels_order="first")
    obs_shape_before = env.reset().shape
    # The observation space should be different as the heuristic thinks it is channel last
    assert not np.allclose(obs_shape_before, VecTransposeImage(env).reset().shape)
    env = VecTransposeImage(env, skip=True)
    # The observation space should be the same as we skip the VecTransposeImage
    assert np.allclose(obs_shape_before, env.reset().shape)

    kwargs = dict(
        n_steps=64,
        policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32)),
        seed=1,
    )
    model = model_class("CnnPolicy", env, **kwargs).learn(250)

    obs = env.reset()
    action, _ = model.predict(obs, deterministic=True)
def eval_env_constructor(n_envs=1):
    """
    Evaluation should be in a scalar environment.
    """
    env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs)
    env = VecFrameStack(env, n_stack=4)
    env = ScalarizeEnvWrapper(env)
    return env
def create_environment(config):
    if config.atari_wrapper:
        env = make_atari_env(config.environment, n_envs=config.workers)
        env = VecFrameStack(env, n_stack=1)
    else:
        env = make_vec_env(config.environment, n_envs=config.workers)
    env = DummyEnvWrapper(env, config.add_stoch)
    return env
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs:
    :param eval_env: Whether it is an environment used for evaluation or not
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    # env = SubprocVecEnv([make_env(env_id, i, self.seed) for i in range(n_envs)])
    # On most env, SubprocVecEnv does not help and is quite memory hungry
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=log_dir,
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
    )

    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id):
        self._log_success_rate(env)

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional Frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel last to channel first convention)
    if is_image_space(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    # Check if wrapper for dict support is needed
    if self.algo == "her":
        if self.verbose > 0:
            print("Wrapping into a ObsDictWrapper")
        env = ObsDictWrapper(env)

    return env
def atari_make(env_name, scalarize=True, **kwargs):
    from stable_baselines3.common.env_util import make_atari_env
    from stable_baselines3.common.vec_env import VecFrameStack

    env = make_atari_env(env_id=env_name, **kwargs)
    env = VecFrameStack(env, n_stack=4)
    if scalarize:
        from rlberry.wrappers.scalarize import ScalarizeEnvWrapper
        env = ScalarizeEnvWrapper(env)
    return env
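# A minimal usage sketch for atari_make above (assumes rlberry is installed
# for the scalarize branch; extra kwargs are forwarded to make_atari_env):
env = atari_make("BreakoutNoFrameskip-v4", scalarize=True, n_envs=1, seed=0)
obs = env.reset()  # a single observation rather than a batch, once scalarized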
def get_env():
    env = make_atari_env(atari_env_name('pong', 'image', 'v4', no_frame_skip=True),
                         n_envs=4, seed=0)
    env = VecFrameStack(env, n_stack=4)
    return env
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = SubprocVecEnv(
            [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)]
        )
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = SubprocVecEnv(
            [
                make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                for i in range(n_envs)
            ]
        )

    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get("frame_stack", False):
        n_stack = hyperparams["frame_stack"]
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")

    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    return env
def atari_env(num_envs=1):
    def env_fn():
        env = gym.make("SpaceInvadersNoFrameskip-v4")
        env = AtariWrapper(env)
        return env

    env = DummyVecEnv([env_fn] * num_envs)
    env = VecFrameStack(env, 4)
    env = VecTransposeImage(env)
    env = VecNormalize(env)
    return env
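# A minimal usage sketch for atari_env above: build two stacked, transposed,
# normalized workers and inspect the batched observation shape.
env = atari_env(num_envs=2)
obs = env.reset()
print(obs.shape)  # expected (2, 4, 84, 84): workers, stacked frames, height, width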
def create_env(n_envs, eval_env=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env else save_path

    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)
        ])
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = DummyVecEnv([
            make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
            for i in range(n_envs)
        ])

    if normalize:
        if args.verbose > 0:
            if len(normalize_kwargs) > 0:
                print(f"Normalization activated: {normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get('frame_stack', False):
        n_stack = hyperparams['frame_stack']
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")

    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    return env
def make_env(env_id, n_envs, frame_stack=True, clip_reward=False,
             terminal_on_life_loss=False, monitor_dir='./log/monitors',
             vec_env_cls=SubprocVecEnv):
    wrapper_kwargs = {'terminal_on_life_loss': terminal_on_life_loss,
                      'clip_reward': clip_reward}
    env = make_atari_env(env_id, n_envs, monitor_dir=monitor_dir,
                         vec_env_cls=vec_env_cls, wrapper_kwargs=wrapper_kwargs)
    if frame_stack:
        env = VecFrameStack(env, 4)
    env = VecTransposeImage(env)
    return env
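# A minimal usage sketch for make_env above (DummyVecEnv avoids spawning
# subprocesses, which is convenient in notebooks and tests):
from stable_baselines3.common.vec_env import DummyVecEnv

env = make_env("PongNoFrameskip-v4", n_envs=2, vec_env_cls=DummyVecEnv)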
def create_vectorized_environment(
        n_envs: int, frame_stack: int,
        env_creation_func: t.Callable) -> VecTransposeImage:
    """Creates a vectorized environment for image-based models.

    :param n_envs: The number of parallel environments to run.
    :param frame_stack: The number of frames to stack in each environment.
    :param env_creation_func: A callable returning a Gym environment.
    :return: A vectorized environment with frame stacking and image transposition.
    """
    return VecTransposeImage(
        VecFrameStack(SubprocVecEnv([env_creation_func] * n_envs), frame_stack))
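# A minimal usage sketch for create_vectorized_environment above (hypothetical
# factory; assumes gym and AtariWrapper are imported as in the other snippets):
def _make_breakout():
    return AtariWrapper(gym.make("BreakoutNoFrameskip-v4"))

env = create_vectorized_environment(n_envs=4, frame_stack=4, env_creation_func=_make_breakout)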
def create_envs(self, n_envs: int, eval_env: bool = False, no_log: bool = False) -> VecEnv:
    """
    Create the environment and wrap it if necessary.

    :param n_envs:
    :param eval_env: Whether it is an environment used for evaluation or not
    :param no_log: Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: the vectorized environment, with appropriate wrappers
    """
    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else self.save_path

    monitor_kwargs = {}
    # Special case for GoalEnvs: log success rate too
    if "Neck" in self.env_id or self.is_robotics_env(self.env_id) or "parking-v0" in self.env_id:
        monitor_kwargs = dict(info_keywords=("is_success",))

    # On most env, SubprocVecEnv does not help and is quite memory hungry,
    # therefore we use DummyVecEnv by default
    env = make_vec_env(
        env_id=self.env_id,
        n_envs=n_envs,
        seed=self.seed,
        env_kwargs=self.env_kwargs,
        monitor_dir=None,  # Avoid useless monitor file spam from plotting
        wrapper_class=self.env_wrapper,
        vec_env_cls=self.vec_env_class,
        vec_env_kwargs=self.vec_env_kwargs,
        monitor_kwargs=monitor_kwargs,
    )

    # Wrap the env into a VecNormalize wrapper if needed
    # and load saved statistics when present
    env = self._maybe_normalize(env, eval_env)

    # Optional Frame-stacking
    if self.frame_stack is not None:
        n_stack = self.frame_stack
        env = VecFrameStack(env, n_stack)
        if self.verbose > 0:
            print(f"Stacking {n_stack} frames")

    # Wrap if needed to re-order channels
    # (switch from channel last to channel first convention)
    if is_image_space(env.observation_space) and not is_image_space_channels_first(env.observation_space):
        if self.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    return env
def test_dict_vec_framestack(model_class, channel_last):
    """
    Additional tests to check observation space support
    for Dictionary spaces and VecEnvWrapper using MultiInputPolicy.
    """
    use_discrete_actions = model_class not in [TQC]
    channels_order = {"vec": None, "img": "last" if channel_last else "first"}
    env = DummyVecEnv([
        lambda: SimpleMultiObsEnv(random_start=True, discrete_actions=use_discrete_actions, channel_last=channel_last)
    ])
    env = VecFrameStack(env, n_stack=3, channels_order=channels_order)

    kwargs = {}
    n_steps = 256

    if model_class in {}:  # No on-policy models in this parametrization, so this branch is skipped
        kwargs = dict(
            n_steps=128,
            policy_kwargs=dict(
                net_arch=[32],
                features_extractor_kwargs=dict(cnn_output_dim=32),
            ),
        )
    else:
        # Avoid memory error when using replay buffer
        # Reduce the size of the features and make learning faster
        kwargs = dict(
            buffer_size=250,
            policy_kwargs=dict(
                net_arch=[32],
                features_extractor_kwargs=dict(cnn_output_dim=32),
                n_quantiles=20,
            ),
            train_freq=8,
            gradient_steps=1,
        )
        if model_class == QRDQN:
            kwargs["learning_starts"] = 0

    model = model_class("MultiInputPolicy", env, gamma=0.5, seed=1, **kwargs)

    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=5, warn=False)
def make_atari_default(
    env_id: Union[str, Type[gym.Env]],
    n_envs: int = 1,
    seed: Optional[int] = None,
    start_index: int = 0,
    monitor_dir: Optional[str] = None,
    wrapper_kwargs: Optional[Dict[str, Any]] = None,
    env_kwargs: Optional[Dict[str, Any]] = None,
    vec_env_cls: Optional[Union[DummyVecEnv, SubprocVecEnv]] = DummyVecEnv,
    vec_env_kwargs: Optional[Dict[str, Any]] = None,
) -> VecEnv:
    """
    Create a wrapped, monitored VecEnv for Atari.
    It is a wrapper around ``make_vec_env`` that includes common preprocessing for Atari games.

    :param env_id: the environment ID or the environment class
    :param n_envs: the number of environments you wish to have in parallel
    :param seed: the initial seed for the random number generator
    :param start_index: start rank index
    :param monitor_dir: Path to a folder where the monitor files will be saved.
        If None, no file will be written, however, the env will still be wrapped
        in a Monitor wrapper to provide additional information about training.
    :param wrapper_kwargs: Optional keyword argument to pass to the ``AtariWrapper``
    :param env_kwargs: Optional keyword argument to pass to the env constructor
    :param vec_env_cls: A custom ``VecEnv`` class constructor. Default: ``DummyVecEnv``.
    :param vec_env_kwargs: Keyword arguments to pass to the ``VecEnv`` class constructor.
    :return: The wrapped environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def atari_wrapper(env: gym.Env) -> gym.Env:
        env = AtariWrapper(env, **wrapper_kwargs)
        return env

    return VecFrameStack(
        make_vec_env_fix(
            env_id,
            n_envs=n_envs,
            seed=seed,
            start_index=start_index,
            monitor_dir=monitor_dir,
            wrapper_class=atari_wrapper,
            env_kwargs=env_kwargs,
            vec_env_cls=vec_env_cls,
            vec_env_kwargs=vec_env_kwargs,
        ),
        n_stack=4,
    )
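# A minimal usage sketch for make_atari_default above (make_vec_env_fix is this
# project's variant of stable-baselines3's make_vec_env, assumed to share its signature):
env = make_atari_default("PongNoFrameskip-v4", n_envs=2, seed=0)
assert env.observation_space.shape[-1] == 4  # 4 stacked grayscale frames, channel-last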
def atari_games_example():
    # There already exists an environment generator that will make and wrap atari environments correctly.
    # Here we are also multi-worker training (n_envs=4 => 4 environments).
    env = make_atari_env("PongNoFrameskip-v4", n_envs=4, seed=0)
    # Frame-stacking with 4 frames.
    env = VecFrameStack(env, n_stack=4)

    model = A2C("CnnPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def test_vec_env_is_wrapped():
    # Test is_wrapped call of subproc workers
    def make_env():
        return CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2)))

    def make_monitored_env():
        return Monitor(CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))))

    # One with monitor, one without
    vec_env = SubprocVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env.close()

    # One with monitor, one without
    vec_env = DummyVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env = VecFrameStack(vec_env, n_stack=2)
    assert vec_env.env_is_wrapped(Monitor) == [False, True]
def run_dqn_baseline():
    env = make_atari_env('BreakoutNoFrameskip-v4', n_envs=1, seed=0)
    env = VecFrameStack(env, n_stack=4)

    tensorboard_log = os.path.join(os.path.dirname(__file__), 'runs_baseline')
    buffer_size = 100000
    num_training_steps = 1000000
    model = DQN('CnnPolicy', env, verbose=0, buffer_size=buffer_size,
                learning_starts=50000, optimize_memory_usage=False,
                tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=num_training_steps)

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    print("Did you even try?")
    eval_env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    eval_env = VecFrameStack(eval_env, n_stack=4)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()
def create_test_env(
    env_id: str,
    n_envs: int = 1,
    stats_path: Optional[str] = None,
    seed: int = 0,
    log_dir: Optional[str] = None,
    should_render: bool = True,
    hyperparams: Optional[Dict[str, Any]] = None,
    env_kwargs: Optional[Dict[str, Any]] = None,
) -> VecEnv:
    """
    Create environment for testing a trained agent

    :param env_id:
    :param n_envs: number of processes
    :param stats_path: path to folder containing saved running averages
    :param seed: Seed for random number generator
    :param log_dir: Where to log rewards
    :param should_render: For Pybullet env, display the GUI
    :param hyperparams: Additional hyperparams (ex: n_stack)
    :param env_kwargs: Optional keyword argument to pass to the env constructor
    :return:
    """
    # Avoid circular import
    from utils.exp_manager import ExperimentManager

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)

    hyperparams = {} if hyperparams is None else hyperparams

    if "env_wrapper" in hyperparams.keys():
        del hyperparams["env_wrapper"]

    vec_env_kwargs = {}
    vec_env_cls = DummyVecEnv
    if n_envs > 1 or (ExperimentManager.is_bullet(env_id) and should_render):
        # HACK: force SubprocVecEnv for Bullet env,
        # as Pybullet envs do not follow the gym.render() interface
        vec_env_cls = SubprocVecEnv
        # start_method = 'spawn' for thread safe

    env = make_vec_env(
        env_id,
        n_envs=n_envs,
        monitor_dir=log_dir,
        seed=seed,
        wrapper_class=env_wrapper,
        env_kwargs=env_kwargs,
        vec_env_cls=vec_env_cls,
        vec_env_kwargs=vec_env_kwargs,
    )

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            print("Loading running average")
            print(f"with params: {hyperparams['normalize_kwargs']}")
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)
    return env
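# A minimal usage sketch for create_test_env above (hypothetical paths; assumes
# VecNormalize statistics were saved during training under the stats_path folder):
env = create_test_env(
    "CartPole-v1",
    n_envs=1,
    stats_path="logs/ppo/CartPole-v1_1/CartPole-v1",
    hyperparams={"normalize": True, "normalize_kwargs": {}},
)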
# Effective code with PPO package
import gym
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3 import PPO

env = make_atari_env('Assault-v0', n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=1)

model = PPO('MlpPolicy', env, verbose=1, tensorboard_log='./PPO_log/')
model.learn(total_timesteps=int(3e4))

obs = env.reset()
while True:
    # Transpose the fresh observation each step before predicting
    obs_ = obs.transpose(3, 0, 1, 2)
    action, _states = model.predict(obs_)
    obs, rewards, dones, info = env.step(action)
    env.render()
def create_test_env(env_id, n_envs=1, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None, env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
            for i in range(n_envs)
        ])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper, env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
env = RewardWrapper(env)
env = ResizeWrapper(env, shape=(64, 80, 3))
if custom_params['algo'] == 'dqn':
    env = DiscreteWrapper(env)
if custom_params['USING_VAE']:
    env = NormalizeWrapper(env)  # No need to use normalization if image
    env = FinalLayerObservationWrapper(env, latent_dim=1028, map="map3")

# Step 3.b. To make Vectorized Environment to be able to use Normalize or FrameStack (Optional)
env = make_vec_env(lambda: env, n_envs=1)
# Step 3.b Passing through Normalization and stack frame (Optional)
env = VecFrameStack(env, n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"), env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))
        obs, rewards, dones, info = eval_env.step(action)
        eval_env.render()
    # Close the video recorder
    # eval_env.close()


# Stack 4 frames
env_id = 'PongNoFrameskip-v4'
video_folder = 'logs/videos/'
video_length = 1000
nEnv = 8
startFresh = False
if startFresh:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    model = A2C('CnnPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)
    model.save("a2c_pong_{}".format(model.num_timesteps))
    record_video(env_id, model, video_length=500, prefix='ac2_' + env_id, video_folder='videos/')
else:
    env = make_atari_env(env_id, n_envs=nEnv, seed=0)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    trained_model = A2C.load("a2c_pong_200000", verbose=1)
    trained_model.set_env(env)
def run(policy, envname, learning_rate, n_steps, epochs, gamma, gae_lambda,
        ent_coef, vf_coef, max_grad_norm, normalize_advantage, policy_kwargs,
        n_eval_episodes, eval_freq, n_envs, n_stack, total_timesteps,
        log_interval, device="cuda", verbose=True, tensorboard_log="logs/"):
    # Normalize with multi environments
    seed = np.random.randint(1, 2**16)
    all_args = locals()

    path = "/" + os.path.join(*sb3.__file__.split("/")[:-2])
    commit_num = subprocess.check_output(["git", "describe", "--always"], cwd=path).strip().decode()

    env = make_atari_env(envname, n_envs=n_envs, seed=seed)
    env = VecFrameStack(env, n_stack=n_stack)

    # Callbacks
    loggercallback = LoggerCallback("json", [("arguments", all_args), ("git", commit_num)])
    # No seed as the evaluation has no effect on training or pruning
    evalcallback = EvalCallback(
        make_atari_env(envname, vec_env_cls=SubprocVecEnv),
        n_eval_episodes=n_eval_episodes,
        eval_freq=eval_freq,
    )

    # Initiate the model and start learning
    model = A2C(policy, env, learning_rate, n_steps, epochs, gamma, gae_lambda,
                ent_coef, vf_coef, max_grad_norm, normalize_advantage, policy_kwargs,
                verbose=verbose, tensorboard_log=tensorboard_log, seed=seed, device=device)
    model.learn(
        total_timesteps=total_timesteps,
        log_interval=log_interval,
        callback=[loggercallback, evalcallback],
        tb_log_name=envname,
    )
    model.env.close()
    evalcallback.eval_env.close()
    return evalcallback.best_mean_reward
def env_constructor(n_envs=4):
    env = make_atari_env("MontezumaRevenge-v0", n_envs=n_envs)
    env = VecFrameStack(env, n_stack=4)
    return env
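# A minimal usage sketch pairing the two constructors from this collection:
# env_constructor for training, eval_env_constructor (defined earlier) for evaluation.
train_env = env_constructor(n_envs=4)
eval_env = eval_env_constructor(n_envs=1)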