def test_custom_vec_env(tmp_path):
    """
    Stand alone test for a special case (passing a custom VecEnv class)
    to avoid doubling the number of tests.
    """
    monitor_dir = tmp_path / "test_make_vec_env/"
    env = make_vec_env(
        "CartPole-v1",
        n_envs=1,
        monitor_dir=monitor_dir,
        seed=0,
        vec_env_cls=SubprocVecEnv,
        vec_env_kwargs={"start_method": None},
    )

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir(monitor_dir)
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not have any keyword argument
    with pytest.raises(TypeError):
        make_vec_env("CartPole-v1", n_envs=1, vec_env_kwargs={"dummy": False})
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def main(args):
    envs = make_vec_env(args.env_name, n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy', envs, verbose=1, seed=args.seed, device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
def __init__(self, args):
    self.envName = args.envName
    self.numEnvs = args.numEnvs
    self.algo = args.algo
    self.timeSteps = args.timeSteps
    self.saveDir = args.saveDir
    # if self.algo in ['PPO','A2C']:
    self.env = make_vec_env(self.envName, n_envs=self.numEnvs)
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    # do not update the normalization statistics at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    # env = gym.make(args.env)

    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    # predict returns a (action, state) tuple
    action, _ = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))
    if args.timesteps is None:
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    if args.gif:
        # keep every other frame to halve the GIF size
        imageio.mimsave(
            os.path.join("logs", args.env, args.agent, "recording.gif"),
            [np.array(frame) for i, frame in enumerate(img) if i % 2 == 0],
            fps=29,
        )
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls,
                       wrapper_class=wrapper_class, monitor_dir=None, seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
    # Kill subprocesses
    env.close()
def __init__(self, rap, log_dir="/tmp/gym", training_config=None,
             algorithm="A2C", checkpoint_results=None):
    super(ResourceManager, self).__init__(rap, log_dir=log_dir,
                                          algorithm=algorithm,
                                          checkpoint_results=checkpoint_results)

    self.model_name = rap["name"] + "_baseline"

    self.environment = ResourceAllocationEnvironment(self.ra_problem)
    check_env(self.environment, warn=True)
    self.vector_environment = make_vec_env(lambda: self.environment,
                                           n_envs=1, monitor_dir=self.log_dir)

    self.training_steps = training_config["stage1_training_steps"]
def train(config, checkpoint_dir=None):
    if model_keys:
        for key, path in zip(model_keys, model_paths):
            submodel = PPO.load(path)
            environment_kwargs["lower_lvl_models"][key] = submodel
    if "stage1_models" in policy_kwargs:
        for path in model_paths:
            stage1_model = PPO.load(path)
            policy_kwargs["stage1_models"].append(stage1_model)
    environment = environment_class(ra_problem, **environment_kwargs)
    vector_environment = make_vec_env(lambda: environment, n_envs=1,
                                      monitor_dir=log_dir)

    rewards = []
    for n in range(hpsearch_iterations):
        model = PPO(policy, vector_environment, verbose=1,
                    tensorboard_log=log_dir,
                    learning_rate=config["learning_rate"],
                    ent_coef=config["ent_coef"],
                    max_grad_norm=config["max_grad_norm"],
                    policy_kwargs=policy_kwargs)
        model.learn(total_timesteps=training_steps)
        reward = evaluate_policy(model, vector_environment, n_eval_episodes=100)[0]
        rewards.append(reward)
    tune.report(reward=np.mean(rewards), std=np.std(rewards))
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env,
        callback_on_new_best=callback_on_best,
        best_model_save_path=log_folder,
        log_path=log_folder,
        eval_freq=100,
    )

    # Equivalent to the `checkpoint_callback`,
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")
    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if the max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback, callback_max_episodes])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callbacks were called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]

    # Check that internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
    print("[ERROR] The selected algorithm does not support multiple environments")
    exit()

#### Uncomment to debug slurm scripts ######################
# exit()

env_name = ARGS.env + "-aviary-v0"
sa_env_kwargs = dict(aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act)
# single environment instead of a vectorized one:
# train_env = gym.make(env_name, aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act)
if env_name == "takeoff-aviary-v0":
    train_env = make_vec_env(TakeoffAviary, env_kwargs=sa_env_kwargs, n_envs=ARGS.cpu, seed=0)
if env_name == "hover-aviary-v0":
    train_env = make_vec_env(HoverAviary, env_kwargs=sa_env_kwargs, n_envs=ARGS.cpu, seed=0)
if env_name == "flythrugate-aviary-v0":
    train_env = make_vec_env(FlyThruGateAviary, env_kwargs=sa_env_kwargs, n_envs=ARGS.cpu, seed=0)
print("[INFO] Action space:", train_env.action_space)
print("[INFO] Observation space:", train_env.observation_space)
# check_env(train_env, warn=True, skip_render_check=True)
def run(train_freq, gradient_steps, batch_size, envname, n_envs, log_interval,
        learning_rate, buffer_size, tau, gamma, target_policy_noise,
        target_noise_clip, learning_starts, total_timesteps, policy_kwargs,
        action_noise_mean, action_noise_sigma, noise_type, eval_freq,
        n_eval_episodes, verbose=True, tensorboard_log="logs/"):
    # Normalize per-environment quantities when using multiple environments
    eval_freq = max(eval_freq // n_envs, 1)
    buffer_size = max(buffer_size // n_envs, 1)
    all_args = locals()

    path = "/" + os.path.join(*sb3.__file__.split("/")[:-2])
    commit_num = subprocess.check_output(["git", "describe", "--always"], cwd=path).strip().decode()

    env = gym.make(envname)
    vecenv = make_vec_env(envname, vec_env_cls=SubprocVecEnv, n_envs=n_envs)

    # The noise objects for DDPG-style exploration
    n_actions = env.action_space.shape[-1]
    if noise_type == "OU":
        base_noise_class = OrnsteinUhlenbeckActionNoise
    elif noise_type == "Normal":
        base_noise_class = NormalActionNoise
    base_noise = base_noise_class(
        mean=np.ones(n_actions) * action_noise_mean,
        sigma=action_noise_sigma * np.ones(n_actions),
    )
    action_noise = VectorizedActionNoise(base_noise, vecenv.num_envs)

    # Callbacks
    loggercallback = LoggerCallback("json", [("arguments", all_args), ("git", commit_num)])
    evalcallback = EvalCallback(
        make_vec_env(envname, vec_env_cls=SubprocVecEnv),
        n_eval_episodes=n_eval_episodes,
        eval_freq=eval_freq,
    )

    # Initiate the model and start learning
    model = TD3(
        "MlpPolicy",
        vecenv,
        action_noise=action_noise,
        batch_size=batch_size,
        train_freq=train_freq,
        gradient_steps=gradient_steps,
        learning_starts=learning_starts,
        n_episodes_rollout=-1,
        learning_rate=learning_rate,
        buffer_size=buffer_size,
        tau=tau,
        gamma=gamma,
        create_eval_env=True,
        target_policy_noise=target_policy_noise,
        target_noise_clip=target_noise_clip,
        verbose=verbose,
        policy_kwargs=policy_kwargs,
        tensorboard_log=tensorboard_log,
        device="cuda",
    )
    model.learn(
        total_timesteps=total_timesteps,
        log_interval=log_interval,
        callback=[loggercallback, evalcallback],
        tb_log_name=envname,
    )
    model.env.close()
    evalcallback.eval_env.close()
    return evalcallback.best_mean_reward
if ARGS.algo in ['sac', 'td3', 'ddpg'] and ARGS.cpu != 1:
    print("[ERROR] The selected algorithm does not support multiple environments")
    exit()

#### Uncomment to debug slurm scripts ##############################################################
# exit()

env_name = ARGS.env + "-aviary-v0"
# single environment instead of a vectorized one:
# train_env = gym.make(env_name, aggregate_phy_steps=AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act)
if env_name == "takeoff-aviary-v0":
    train_env = make_vec_env(TakeoffAviary,
                             env_kwargs=dict(aggregate_phy_steps=AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act),
                             n_envs=ARGS.cpu,
                             seed=0)
if env_name == "hover-aviary-v0":
    train_env = make_vec_env(HoverAviary,
                             env_kwargs=dict(aggregate_phy_steps=AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act),
                             n_envs=ARGS.cpu,
                             seed=0)
if env_name == "flythrugate-aviary-v0":
    train_env = make_vec_env(FlyThruGateAviary,
                             env_kwargs=dict(aggregate_phy_steps=AGGR_PHY_STEPS,
def test_vec_env_kwargs():
    env = make_vec_env("MountainCarContinuous-v0", n_envs=1, seed=0,
                       env_kwargs={"goal_velocity": 0.11})
    assert env.get_attr("goal_velocity")[0] == 0.11
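# For reference: env_kwargs above are forwarded to the underlying environment
# constructor. A minimal sketch of the equivalent manual construction, assuming
# the classic-control gym API (gym.make also forwards extra kwargs):
import gym

manual_env = gym.make("MountainCarContinuous-v0", goal_velocity=0.11)
assert manual_env.goal_velocity == 0.11  # attribute set by the constructor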
# LICENSE file in the root directory of this source tree.

# Change stable_baselines to stable_baselines3 if you are using the newer version
from ABC_Env_CSC2547 import ABCEnv
import gym
import numpy as np

from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

# Instantiate the env
env = ABCEnv()
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
# Something you might want to play around with: learning rate,
# total timesteps, etc. Always choose a sample-efficient algorithm.
total_timesteps = 200
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log="./CSC2547_tensorboard/")
model.learn(total_timesteps)
model_name = "DQN_timesteps_" + str(total_timesteps)
model.save(model_name)
def make_env(env_id, n_envs, vec_env_cls=SubprocVecEnv):
    # use the vec_env_cls argument instead of hardcoding SubprocVecEnv
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls)
    env = VecNormalize(env, norm_reward=True)
    return env
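# A minimal usage sketch for the helper above, assuming a registered Gym id
# such as "CartPole-v1". The __main__ guard matters because SubprocVecEnv
# spawns worker processes that re-import this module.
if __name__ == "__main__":
    vec = make_env("CartPole-v1", n_envs=4)
    obs = vec.reset()
    print(obs.shape)  # (4, obs_dim): one row per parallel env
    vec.close()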
import os

import pybullet_envs
from pybullet_envs.stable_baselines.utils import TimeFeatureWrapper
from torch import nn

from stable_baselines3 import PPO
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

env = make_vec_env("HopperBulletEnv-v0", n_envs=8, wrapper_class=TimeFeatureWrapper)
env = VecNormalize(
    env,
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
)

model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log="hopper_ppo",
    batch_size=128,
    n_steps=512,
    gamma=0.99,
    gae_lambda=0.92,
    alpha=99 * 2.,
    beta=1 * 2.,
        self.pbar = tqdm(total=self.total_timesteps)
        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # close the progress bar
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = make_vec_env('foo-v0', n_envs=1, monitor_dir=log_dir)

tic = time.perf_counter()

# Create callbacks
save_callback = SaveOnBestTrainingRewardCallback(check_freq=10000, log_dir=log_dir)

model = stable_baselines3.DQN('MlpPolicy', env, verbose=0, learning_rate=1e-4)
model = model.load('AAA', env)

steps = 10e6
with ProgressBarManager(steps) as progress_callback:
    # This is equivalent to callback=CallbackList([progress_callback, save_callback])
    model = model.learn(steps, callback=[progress_callback, save_callback])
model.save('AAA')
            raise ValueError(f'Unrecognized action {action}')
        self._state = np.clip(self._state, 0, self._grid_size - 1)
        done = bool(self._state == self._grid_size - 1)
        reward = 1 if done else 0
        return np.array([self._state]).astype(np.float32), reward, done, {}

    def reset(self):
        self._state = 0
        return np.array([self._state]).astype(np.float32)

    def render(self, mode='human'):
        pass


if __name__ == '__main__':
    check_env(GridWorld(10))
    env = make_vec_env(lambda: GridWorld(10), n_envs=1)
    model = PPO('MlpPolicy', env, verbose=1).learn(5000)
    state = env.reset()
    for _ in range(20):
        action, _ = model.predict(state, deterministic=True)
        # action = 0
        next_state, reward, done, info = env.step(action)
        print(f'{state} -> {action} -> {next_state}: {reward}')
        state = next_state
        if done:
            break
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.policies import ActorCriticCnnPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import numpy as np
from model import BehaviorCloneNet, CarModel
from logloader import LogLoader
import time
from torchvision.transforms import Compose, ToTensor, Normalize
from custom_arch import CustomCNN, CustomActorCriticPolicy

# DeepwatchEnv2 is this project's custom environment (its import is not shown in the original)
env = make_vec_env(DeepwatchEnv2)
policy_kwargs = dict(features_extractor_class=CustomCNN)
# check_env(env)

n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

# model = TD3(CnnPolicy, env, action_noise=action_noise, buffer_size=50000, verbose=1)  # optimize_memory_usage=True
# model = SAC(CnnPolicy, env, buffer_size=50000, action_noise=action_noise, learning_rate=0.0005, tensorboard_log='./tensorboard', verbose=1)
# model = SAC.load("deepwatch_evolution_sac_7", env)
model = A2C(MlpPolicy, env, verbose=1, n_steps=5)  # , policy_kwargs=policy_kwargs
# A2C.load returns a new model, so the result must be reassigned
model = A2C.load("deepwatch_evolution_a2c_2", env=env)
# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# -----------------------------------------------------------
# environment setup
# env = gym.make('Pendulum-v0')
# env = Monitor(env, log_dir)

# env_string = 'LunarLander-v2'
# env = make_vec_env(env_string, n_envs=1, monitor_dir=log_dir)  # Parallel environments
# eval_env = gym.make(env_string)

env_string = 'ransim-v0'
env = make_vec_env(env_string, env_kwargs={"t_final": 5000}, n_envs=1, monitor_dir=log_dir)  # Parallel environments
eval_env = gym.make(env_string, t_final=5000)

# -------------------------------------------------------------------------
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=200,
                             deterministic=True, render=False)
ransim_callback = CustomRansimCallback(eval_env,
                                       best_model_save_path='./logs/',
                                       log_path='./logs/',
                                       eval_freq=10 * 5 * 1e2,