def train_simple_opponent(args):
    env_name = "WimblepongVisualBadAI-v0"
    env = gym.make(env_name)
    # env = ParallelEnvs(env_name, processes=4, envs_per_process=1)
    env = SubprocVecEnv(
        [make_env(env_name, args.seed + i) for i in range(args.num_envs)],
        start_method="spawn")
    env = VecFrameStack(env, n_stack=4)

    if args.algorithm.lower() == "dqn":
        agent = DQNagent.Agent(env_name, env.observation_space, env.action_space)
    elif args.algorithm.lower() == "ppo":
        agent = ppo_agent_stack_4.Agent()
        agent.init_memory(args.steps_per_env, args.num_envs)
        agent.is_training = True
        if args.checkpoint:
            agent.load_checkpoint()
        elif args.pretrained_model:
            agent.load_model()
    else:
        raise NotImplementedError(
            f"No such algorithm: {args.algorithm.lower()}")

    train(env, agent, args)
    agent.save_policy()
    env.close()
def get_multiproc_env(self, n=10):
    def get_self():
        return deepcopy(self)

    e = SubprocVecEnv([get_self for _ in range(n)], start_method="fork")
    obs = e.reset()
    return e, obs
def create_env(n_envs, eval_env=False, no_log=False):
    """
    Create the environment and wrap it if necessary

    :param n_envs: (int)
    :param eval_env: (bool) Whether it is an environment used for evaluation or not
    :param no_log: (bool) Do not log training when doing hyperparameter optim
        (issue with writing the same file)
    :return: (Union[gym.Env, VecEnv])
    """
    global hyperparams
    global env_kwargs

    # Do not log eval env (issue with writing the same file)
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = SubprocVecEnv(
            [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper,
                      log_dir=log_dir, env_kwargs=env_kwargs)]
        )
    else:
        # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
        # On most env, SubprocVecEnv does not help and is quite memory hungry
        env = SubprocVecEnv(
            [
                make_env(env_id, i, args.seed, log_dir=log_dir,
                         env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                for i in range(n_envs)
            ]
        )

    if normalize:
        # Copy to avoid changing default values by reference
        local_normalize_kwargs = normalize_kwargs.copy()
        # Do not normalize reward for env used for evaluation
        if eval_env:
            if len(local_normalize_kwargs) > 0:
                local_normalize_kwargs["norm_reward"] = False
            else:
                local_normalize_kwargs = {"norm_reward": False}

        if args.verbose > 0:
            if len(local_normalize_kwargs) > 0:
                print(f"Normalization activated: {local_normalize_kwargs}")
            else:
                print("Normalizing input and reward")
        env = VecNormalize(env, **local_normalize_kwargs)

    # Optional Frame-stacking
    if hyperparams.get("frame_stack", False):
        n_stack = hyperparams["frame_stack"]
        env = VecFrameStack(env, n_stack)
        print(f"Stacking {n_stack} frames")

    if is_image_space(env.observation_space):
        if args.verbose > 0:
            print("Wrapping into a VecTransposeImage")
        env = VecTransposeImage(env)

    return env
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Wrap the
    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs, norm_obs=True, clip_obs=np.inf,
                            norm_reward=False, clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs, device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def run_model_stablebaseline3(flow_params, num_cpus=1, rollout_size=5, num_steps=5):
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines3 import PPO
    from stable_baselines3.ppo import MlpPolicy
    import torch.nn as nn

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO(MlpPolicy,
                      env=env,
                      verbose=1,
                      n_epochs=rollout_size,
                      tensorboard_log="./PPO_tensorboard/",
                      device="cuda")  # cpu, gpu selection
    # automatically select gpu
    train_model.learn(total_timesteps=num_steps * rollout_size)
    return train_model
def make_env(seed: int, n_envs: int, run_dir: str, frame_skip: int,
             frame_stack: int, is_eval: bool = False) -> VecEnv:
    """
    Makes vectorized env with required wrappers

    :param seed: Random seed
    :param n_envs: Number of environments to run in parallel
    :param run_dir: Run directory
    :param frame_skip: Skip every nth frame
    :param frame_stack: Stack n frames together
    :param is_eval: True if used for evaluation
    :return: Vectorized env
    """
    if n_envs == 1:
        env = DummyVecEnv([_env_fn(seed, run_dir, frame_skip, is_eval)])
    else:
        env = SubprocVecEnv([
            _env_fn(seed + i, run_dir, frame_skip, is_eval)
            for i in range(n_envs)
        ])

    if frame_stack > 0:
        # Stack the number of frames requested by the caller
        return VecFrameStack(env, n_stack=frame_stack)
    else:
        return env
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controlling performance level of agent

    try:
        os.mkdir(save_dir)
    except OSError as error:
        pass

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint' + save_path)
def make_vec_envs(
        cls,
        evaluating: bool,
        num_processes: int,
        render: bool,
        synchronous: bool,
        log_dir=None,
        mp_kwargs: dict = None,
        **kwargs,
) -> VecPyTorch:
    if mp_kwargs is None:
        mp_kwargs = {}
    if num_processes == 1:
        synchronous = True
    if synchronous:
        kwargs.update(mp_kwargs)

    def env_thunk(rank):
        def thunk(**_kwargs):
            return cls.make_env(rank=rank, evaluating=evaluating, **_kwargs, **kwargs)

        return thunk

    env_fns = [env_thunk(i) for i in range(num_processes)]
    return VecPyTorch(
        DummyVecEnv(env_fns, render=render)
        if synchronous or num_processes == 1
        else SubprocVecEnv(env_fns, **mp_kwargs, start_method="fork", render=render))
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device,
                  allow_early_resets, num_frame_stack=None):
    envs = [
        make_env(env_name, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, norm_reward=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])
    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
def create_vectorized_environment(
        n_envs: int,
        frame_stack: int,
        env_creation_func: t.Callable) -> VecTransposeImage:
    """Creates a vectorized environment for image-based models.

    :param n_envs: The number of parallel environments to run.
    :param frame_stack: The number of frames to stack in each environment.
    :param env_creation_func: A callable returning a Gym environment.
    :return: A vectorized environment with frame stacking and image transposition.
    """
    return VecTransposeImage(
        VecFrameStack(SubprocVecEnv([env_creation_func] * n_envs), frame_stack))
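# A minimal usage sketch for the helper above (not part of the original
# snippet): the environment id and the lambda are illustrative assumptions;
# any callable returning an image-based Gym env would work the same way.
import gym

if __name__ == "__main__":
    venv = create_vectorized_environment(
        n_envs=4,
        frame_stack=4,
        env_creation_func=lambda: gym.make("PongNoFrameskip-v4"))
    obs = venv.reset()
    venv.close()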
def make_venv(args):
    if not args.subproc:
        # Performs actions sequentially
        venv = DummyVecEnv(
            [make_env(args.env, args.subproc, i) for i in range(args.num_env)]
        )
    else:
        # Performs actions in parallel processes
        venv = SubprocVecEnv(
            [make_env(args.env, args.subproc, i) for i in range(args.num_env)]
        )
    return venv
def make_ai_matchmaker_stack(all_stats, all_opps, all_elos, game_path, model_dir,
                             base_port=50000,
                             image_based=False,
                             level_path=None,
                             env_p=3,
                             starting_elo=None,
                             K=16,
                             D=5.,
                             time_reward=-0.003,
                             num_envs=1,
                             matchmaking_mode=0,
                             win_loss_ratio="0:0"):
    envs = []
    for i in range(num_envs):
        envs.append(
            lambda a=all_stats, b=all_opps, c=all_elos, d=game_path, e=model_dir,
                   f=base_port + (i * 2), g=base_port + (i * 2) + 1,
                   h=image_based, i=level_path, j=env_p, k=starting_elo,
                   l=time_reward, m=matchmaking_mode,
                   n=[int(x) for x in win_loss_ratio.split(':')]:
                AIMatchmaker(a, b, c, d, e,
                             base_port=f,
                             my_port=g,
                             image_based=h,
                             level_path=i,
                             env_p=j,
                             starting_elo=k,
                             time_reward=l,
                             matchmaking_mode=m,
                             win_loss_ratio=n)
        )
    env_stack = SubprocVecEnv(envs, start_method="fork")
    env_stack.reset()
    return env_stack
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder
    local_eval_env = SubprocVecEnv(
        [make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True,
                                  norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)

    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                                      record_video_trigger=lambda step: step == 0,
                                      video_length=videoLength,
                                      name_prefix=prefix)
    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
def main():
    # env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)

    callback = custom_call_back.CustomCallback(check_freq=1000, log_dir=log_dir)
    model = PPO('MlpPolicy', env, verbose=1, n_steps=500, batch_size=10000)
    model.learn(total_timesteps=2500000000, callback=callback)
def make_env_stack(num_envs, game_path, base_port, game_log_path, opp_fp_and_elo,
                   trainee_elo, elo_match=True, survivor=False, stdout_path=None,
                   level_path=None, image_based=False, time_reward=0., env_p=3):
    if num_envs >= 1:
        envs = []
        for i in range(num_envs):
            envs.append(
                lambda game_path=game_path,
                       b=base_port + (i * 2),
                       c=game_log_path.replace(".txt", "-" + str(i) + ".txt"),
                       d=opp_fp_and_elo,
                       e=elo_match,
                       f=trainee_elo,
                       g=survivor,
                       h=stdout_path.replace(".txt", "-" + str(i) + ".txt"),
                       i=level_path,
                       j=image_based,
                       k=time_reward:
                    TankEnv(game_path,
                            game_port=b,
                            game_log_path=c,
                            opp_fp_and_elo=d,
                            elo_match=e,
                            center_elo=f,
                            survivor=g,
                            stdout_path=h,
                            verbose=True,
                            level_path=i,
                            image_based=j,
                            time_reward=k,
                            p=env_p))
        if num_envs == 1:
            env_stack = SubprocVecEnv(envs, start_method="fork")
        else:
            env_stack = SubprocVecEnv(envs, start_method="forkserver")
        env_stack.reset()
        return env_stack
    else:
        env = TankEnv(game_path,
                      game_port=base_port,
                      game_log_path=game_log_path,
                      opp_fp_and_elo=opp_fp_and_elo,
                      elo_match=elo_match,
                      center_elo=trainee_elo,
                      survivor=survivor,
                      stdout_path=stdout_path,
                      level_path=level_path,
                      image_based=image_based,
                      time_reward=time_reward,
                      p=env_p)
        env.reset()
        return env
def make_ai_matchmaker_eval_stack(game_path, base_port, image_based, level_path,
                                  env_p, num_envs):
    envs = []
    for i in range(num_envs):
        envs.append(
            lambda a=game_path, b=base_port + (i * 2), c=base_port + (i * 2) + 1,
                   d=image_based, e=level_path, f=env_p:
                TankEnv(a,
                        opp_fp_and_elo=[],
                        game_port=b,
                        my_port=c,
                        elo_match=False,
                        image_based=d,
                        level_path=e,
                        p=f)
        )
    env_stack = SubprocVecEnv(envs, start_method="fork")
    return env_stack
def test_vec_env_is_wrapped():
    # Test is_wrapped call of subproc workers
    def make_env():
        return CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2)))

    def make_monitored_env():
        return Monitor(
            CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))))

    # One with monitor, one without
    vec_env = SubprocVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env.close()

    # One with monitor, one without
    vec_env = DummyVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env = VecFrameStack(vec_env, n_stack=2)
    assert vec_env.env_is_wrapped(Monitor) == [False, True]
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)

    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Wrap the
    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)

    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy, envs, n_steps=args.n_steps, verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
def make_vec_envs(env_name, seed, dummy_vecenv, parallel, time_limit, wrappers,
                  device, monitor_dir=None):
    envs = [
        make_env(env_name, seed, i, time_limit, wrappers, monitor_dir)
        for i in range(parallel)
    ]

    if dummy_vecenv or len(envs) == 1 or monitor_dir:
        envs = MADummyVecEnv(envs)
    else:
        envs = SubprocVecEnv(envs, start_method="fork")

    envs = VecPyTorch(envs, device)
    return envs
def multiprocessing_example():
    # Multiprocessing: Unleashing the Power of Vectorized Environments
    def make_env(env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID.
        :param num_env: (int) the number of environments you wish to have in subprocesses.
        :param seed: (int) the initial seed for RNG.
        :param rank: (int) index of the subprocess.
        """
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use.
    # Create the vectorized environment.
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper which does exactly
    # the previous steps for you. You can choose between 'DummyVecEnv'
    # (usually faster) and 'SubprocVecEnv'.
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
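# Note (an addition, not from the original snippet): because SubprocVecEnv
# starts worker processes with 'forkserver' or 'spawn', the Stable-Baselines3
# docs recommend invoking code like the example above only from inside an
# import guard, so the child processes that re-import this module do not
# re-run the training code.
if __name__ == "__main__":
    multiprocessing_example()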
def main():
    # Create the callback: check every 1000 steps
    log_dir = 'log'
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # , callback=callback, =TensorboardCallback()

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000,
                     prefix='ppo_' + env_name + videoName)
# obs, reward, done, _ = env.step(env.action_space.sample())
# if done == True:
#     break
# env.render()


def make_env(env: gym.Env, rank: int, seed: int = 0) -> Callable:
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    :return: (Callable)
    """
    def _init() -> gym.Env:
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


params = {"learning_rate": 1e-5}
vec_env = SubprocVecEnv([make_env(env, i) for i in range(4)])
agent = A2C('MlpPolicy', vec_env, verbose=0)
# agent = A2C(MlpPolicy, env, n_steps=1000, **params)
agent.learn(total_timesteps=1000)
def make_vec_env(
    env_name: str,
    n_envs: int = 8,
    seed: int = 0,
    parallel: bool = False,
    log_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
    post_wrappers: Optional[Sequence[Callable[[gym.Env, int], gym.Env]]] = None,
) -> VecEnv:
    """Returns a VecEnv initialized with `n_envs` Envs.

    Args:
        env_name: The Env's string id in Gym.
        n_envs: The number of duplicate environments.
        seed: The environment seed.
        parallel: If True, uses SubprocVecEnv; otherwise, DummyVecEnv.
        log_dir: If specified, saves Monitor output to this directory.
        max_episode_steps: If specified, wraps each env in a TimeLimit wrapper
            with this episode length. If not specified and `max_episode_steps`
            exists for this `env_name` in the Gym registry, uses the registry
            `max_episode_steps` for every TimeLimit wrapper (this automatic
            wrapper is the default behavior when calling `gym.make`). Otherwise
            the environments are passed into the VecEnv unwrapped.
        post_wrappers: If specified, iteratively wraps each environment with each
            of the wrappers specified in the sequence. The argument should be a
            Callable accepting two arguments, the Env to be wrapped and the
            environment index, and returning the wrapped Env.
    """
    # Resolve the spec outside of the subprocess first, so that it is available to
    # subprocesses running `make_env` via automatic pickling.
    spec = gym.spec(env_name)

    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environment unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.
        env = spec.make()

        # Seed each environment with a different, non-sequential seed for diversity
        # (even if caller is passing us sequentially-assigned base seeds). int() is
        # necessary to work around gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif spec.max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, "monitor")
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f"mon{i:03d}")
        env = monitor.Monitor(env, log_path)
        env = wrappers.RolloutInfoWrapper(env)

        if post_wrappers:
            for wrapper in post_wrappers:
                env = wrapper(env, i)

        return env

    rng = np.random.RandomState(seed)
    env_seeds = rng.randint(0, (1 << 31) - 1, (n_envs,))
    env_fns = [functools.partial(make_env, i, s) for i, s in enumerate(env_seeds)]
    if parallel:
        # See GH hill-a/stable-baselines issue #217
        return SubprocVecEnv(env_fns, start_method="forkserver")
    else:
        return DummyVecEnv(env_fns)
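# A hedged usage sketch of the factory above (not from the original source);
# it assumes the helper and its imports are available, and "CartPole-v1" is
# only an illustrative environment id. parallel=True selects SubprocVecEnv
# with the 'forkserver' start method, as documented in the docstring.
if __name__ == "__main__":
    venv = make_vec_env("CartPole-v1", n_envs=4, seed=0, parallel=True)
    obs = venv.reset()
    venv.close()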
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
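# A sketch of the same setup using the make_vec_env helper that the script
# above imports but never uses; the CartPole-v1 task and 4 processes are
# carried over from that script, not new requirements.
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

if __name__ == '__main__':
    # vec_env_cls selects SubprocVecEnv instead of the default DummyVecEnv.
    vec_env = make_vec_env("CartPole-v1", n_envs=4, seed=0,
                           vec_env_cls=SubprocVecEnv)
    model = PPO('MlpPolicy', vec_env, verbose=1)
    model.learn(total_timesteps=25000)
    vec_env.close()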
def create_test_env(env_id, n_envs=1, stats_path=None, seed=0,
                    log_dir='', should_render=True, hyperparams=None, env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id, i, seed, log_dir, wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
            for i in range(n_envs)
        ])
    # Pybullet envs do not follow gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id, 0, seed, log_dir, wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env, training=False, **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
def create_test_env(
    env_id, n_envs=1, stats_path=None, seed=0, log_dir="",
    should_render=True, hyperparams=None, env_kwargs=None
):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Clean hyperparams, so the dict can be passed to the model constructor
    if True:
        keys_to_delete = ["n_envs", "n_timesteps", "env_wrapper", "callback", "frame_stack"]
        for key in keys_to_delete:
            delete_key(hyperparams, key)

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir, env_kwargs=env_kwargs) for i in range(n_envs)]
        )
    # Pybullet envs do not follow gym.render() interface
    elif "Bullet" in env_id or "Walker2D" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            # print("Loading running average")
            # print("with params: {}".format(hyperparams["normalize_kwargs"]))
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)

    return env
tb_log_folder = 'ppo_fetchpush_tensorboard'
tb_log_name = '2M_OSC_POSE'

load_model_for_training_path = None
load_vecnormalize_for_training_path = 'trained_models/vec_normalize_6M_OSC_POSE.pkl'

save_model_folder = 'trained_models'
save_model_filename = '2M_OSC_POSE'

load_model_folder = 'trained_models'
load_model_filename = '2M_OSC_POSE'

save_model_path = os.path.join(save_model_folder, save_model_filename)
save_vecnormalize_path = os.path.join(
    save_model_folder, 'vec_normalize_' + save_model_filename + '.pkl')

load_model_path = os.path.join(load_model_folder, load_model_filename)
load_vecnormalize_path = os.path.join(
    load_model_folder, 'vec_normalize_' + load_model_filename + '.pkl')

if training:
    env = SubprocVecEnv([make_training_env(env_id, options, i) for i in range(num_cpu)])
    env = VecNormalize(env)

    if isinstance(load_model_for_training_path, str):
        env = VecNormalize.load(load_vecnormalize_for_training_path, env)
        model = PPO.load(load_model_for_training_path, env=env)
    else:
        model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log_folder)

    eval_env_func = make_training_env(env_id, options, rank=num_cpu)
    eval_env = DummyVecEnv([eval_env_func])
    eval_env = VecNormalize(eval_env)

    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='./best_models/',
                                 log_path='./logs_best_model/',
                                 deterministic=True,
                                 render=False,
                                 n_eval_episodes=10)
def main(config: DictConfig) -> None:
    start_time = time.time()
    set_up(config)
    device = torch.device('cuda:' + str(config.core.gpu_id)
                          if torch.cuda.is_available() and config.core.use_gpu
                          else 'cpu')

    # Set up Wandb (Pass config variables to wandb)
    if config.log.use_wandb:
        hparams = {}
        for key, value in config.items():
            hparams.update(value)
        wandb.init(project="GRF_RL_training", config=hparams)

    if config.log.use_wandb:
        log_handler = wandb
    else:
        log_handler = None

    # Lambda Function to Create Environment
    def make_env(i):
        def thunk():
            if not config.env.use_kaggle_wrapper:
                env = FootballEnvWrapper(
                    env_name=config.env.env_name,
                    obs_representation=config.env.obs_representation,
                    rewards=config.env.rewards,
                    logdir=config.store.log_path,
                    env_id=i)
            else:
                print("Training against agent: " +
                      join(config.env.adversarial_agent_path,
                           config.env.adversarial_agent))
                env = KaggleEnvWrapper(
                    adversarial_agent=join(config.env.adversarial_agent_path,
                                           config.env.adversarial_agent),
                    env_name=config.env.env_name,
                    obs_representation=config.env.obs_representation,
                    rewards=config.env.rewards,
                    logdir=config.store.log_path,
                    env_id=i)
            env.seed(i)
            return env

        return thunk

    if config.env.parallel_env:
        envs = SubprocVecEnv([make_env(i) for i in range(config.env.num_envs)])
    else:
        envs = DummyVecEnv([make_env(i) for i in range(config.env.num_envs)])

    policy_kwargs = dict(
        features_extractor_class=ImpalaCNN,
        features_extractor_kwargs=dict(features_dim=256),
    )

    # Stable-baselines3 PPO
    model = PPO(policy="CnnPolicy",
                policy_kwargs=policy_kwargs,
                env=envs,
                learning_rate=config.train.learning_rate,
                n_steps=config.train.num_steps,
                n_epochs=config.train.update_epochs,
                batch_size=config.train.batch_size,
                clip_range=config.train.clip_range,
                gamma=config.train.gamma,
                gae_lambda=config.train.gae_lambda,
                max_grad_norm=config.train.max_grad_norm,
                vf_coef=config.train.vf_coef,
                ent_coef=config.train.ent_coef,
                log_handler=log_handler,
                model_checkpoints_path=config.store.model_path,
                pretrained_model=join(config.model.pretrained_model_path,
                                      config.model.pretrained_model),
                use_prierarchy_loss=config.train.use_prierarchy_loss,
                device=device,
                verbose=1)

    model.learn(total_timesteps=1000000000, log_interval=6)
def make_env(rank, sparse=False, seed=0):
    def _init():
        env = MazeGridEnv()
        if sparse:
            env = SparseRewardWrapper(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    args = parser.parse_args()
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv(
        [make_env(i, sparse=args.sparse) for i in range(num_cpu)])
    time_steps = args.time_steps
    model = A2C('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=time_steps)

    if not args.sparse:
        model_save += "/MazeGridEnv"
    else:
        model_save += "/SparseMazeGridEnv"
    if not os.path.exists(model_save):
        os.makedirs(model_save)
    model.save(model_save + "/A2C")