def __init__(self, env_fns, start_method=None):
    if start_method is None:
        start_method = 'forkserver'  # forkserver is thread-safe by default
    SubprocVecEnv.__init__(self, env_fns, start_method=start_method)
    # Instantiate one env locally just to read its agent count, then close it;
    # the real environments live in the worker subprocesses.
    env = env_fns[0]()
    num_agents = getattr_unwrapped(env, 'num_agents')
    env.close()
    VecMultiEnv.__init__(self, self.num_envs, num_agents,
                         self.observation_space, self.action_space)
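# A hedged usage sketch: the snippet above only shows __init__, so the class
# name SubprocVecMultiEnv and the env factory below are hypothetical. Any Gym
# env whose unwrapped instance exposes `num_agents` would fit.
def make_env():
    return MyMultiAgentEnv()  # placeholder for a multi-agent Gym env

venv = SubprocVecMultiEnv([make_env for _ in range(4)])
obs = venv.reset()  # observations batched over (env, agent) pairs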
def create_training_env(
        number_of_processes,
        level='academy_empty_goal_close',
        stacked=True,
        representation='extracted',
        reward_experiment='scoring,checkpoints',
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        write_video=False,
        dump_frequency=1,
) -> SubprocVecEnv:
    """
    The meaning of every parameter is documented in gfootball/examples/run_ppo2.py.

    :return: stable_baselines.common.vec_env.subproc_vec_env.SubprocVecEnv
    """
    # Bind the loop variable as a default argument (_i=i) so each subprocess
    # gets its own process number instead of the last value of i.
    return SubprocVecEnv([(lambda _i=i: _create_single_football_env(
        process_number=_i,
        level=level,
        stacked=stacked,
        representation=representation,
        reward_experiment=reward_experiment,
        write_goal_dumps=write_goal_dumps,
        write_full_episode_dumps=write_full_episode_dumps,
        write_video=write_video,
        dump_frequency=dump_frequency,
        render=False,
    )) for i in range(number_of_processes)])
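# A hedged usage sketch, assuming gfootball and stable-baselines are installed
# and create_training_env is importable from this module:
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy

env = create_training_env(number_of_processes=8)  # 8 parallel football envs
model = PPO2(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=100000)
env.close()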
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None,
                   start_index=0, allow_early_resets=True):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for the RNG
    :param wrapper_kwargs: (dict) the parameters for the wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) the wrapped Atari environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            # Log episode stats to <logger dir>/<rank> if a logger dir is set.
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
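# A hedged usage sketch, assuming stable-baselines and the Atari ROMs are
# installed:
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy

env = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=0)
model = PPO2(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=1000000)
env.close()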
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir,
                  device, allow_early_resets, num_frame_stack=None):
    envs = [
        make_env(env_name, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        # envs = ShmemVecEnv(envs, context='fork')
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        # Normalize low-dimensional (vector) observations; only normalize
        # returns when a discount factor is provided.
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        # Default to a 4-frame stack for image observations.
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
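# A hedged usage sketch, assuming make_vec_envs and its helpers (make_env,
# VecPyTorch, VecPyTorchFrameStack, ...) are importable and PyTorch is installed:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
envs = make_vec_envs('PongNoFrameskip-v4', seed=1, num_processes=16,
                     gamma=0.99, log_dir='/tmp/gym', device=device,
                     allow_early_resets=False)
obs = envs.reset()  # torch.Tensor, e.g. (16, 4, 84, 84) for stacked frames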
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Population-based training needs two workers per population member.
    if "num_population" in args.__dict__:
        args.num_cpu = args.num_population * 2

    assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
        "Error: cannot have more than 1 CPU for the environment {}".format(args.env)

    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        # Share one SRL (state representation learning) model across workers.
        srl_model = MultiprocessSRLModel(args.num_cpu, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    envs = [
        makeEnv(args.env, args.seed, i, args.log_dir,
                allow_early_resets=True, env_kwargs=env_kwargs)
        for i in range(args.num_cpu)
    ]
    envs = SubprocVecEnv(envs)
    envs = VecFrameStack(envs, args.num_stack)

    if args.srl_model != "raw_pixels" and args.algo_type == "v2":
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
        envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)

    return envs
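# A hedged usage sketch: makeEnv takes `cls`, so it is presumably a classmethod
# on an algorithm class (AlgoClass is a hypothetical name). The attribute set
# below mirrors what the method reads from `args`; the env id is assumed.
from types import SimpleNamespace

args = SimpleNamespace(env='KukaButtonGymEnv-v0', seed=0, num_cpu=4,
                       log_dir='./logs', num_stack=1,
                       srl_model='raw_pixels', algo_type='v2')
envs = AlgoClass.makeEnv(args)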
def make_tutankham_env_test():
    def make_env():
        def _thunk():
            env = gym.make('Tutankham-v4')
            return wrap_env(env, False)
        return _thunk

    return SubprocVecEnv([make_env()])
def make_tutankham_env(num_env, seed=0, start_index=0):
    def make_env(rank):
        def _thunk():
            env = gym.make('Tutankham-v4')
            env.seed(seed + rank)
            env = Monitor(env, filename=None, allow_early_resets=True)
            return wrap_env(env, True)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
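# A hedged usage sketch, assuming both Tutankham helpers above are importable:
train_env = make_tutankham_env(num_env=8, seed=42)  # monitored, seeded workers
test_env = make_tutankham_env_test()                # single evaluation worker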
def create_demo_env(
        level='academy_empty_goal_close',
        reward_experiment='scoring,checkpoints',
        stacked=True,
        representation='extracted',
        render=False,
):
    return SubprocVecEnv([(lambda _i=i: _create_single_football_env(
        process_number=_i,
        level=level,
        stacked=stacked,
        representation=representation,
        reward_experiment=reward_experiment,
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        write_video=False,
        dump_frequency=1,
        render=render,
    )) for i in range(1)])
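# A hedged usage sketch for replaying a trained agent; the checkpoint path is
# an assumption:
from stable_baselines import PPO2

env = create_demo_env(render=True)
model = PPO2.load('ppo2_football.zip', env=env)  # assumed checkpoint path

obs = env.reset()
for _ in range(1000):
    actions, _states = model.predict(obs)
    obs, rewards, dones, infos = env.step(actions)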
class PPO2:
    def __init__(self, FLAGS):
        self.FLAGS = FLAGS
        self.env_fns = []
        self.env_names = []
        self.environs = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3', 'SpringYardZone.Act1', 'GreenHillZone.Act2',
            'StarLightZone.Act3', 'ScrapBrainZone.Act1'
        ]
        self.environsv2 = ['1Player.Axel.Level1']

    def create_envs(self, game_name, state_name, num_env, render):
        # Build one env factory per (state, copy) pair; SubprocVecEnv runs
        # each factory in its own subprocess.
        for state in state_name:
            for _ in range(num_env):
                self.env_fns.append(
                    partial(make_env, game=game_name, state=state, render=render))
                self.env_names.append(game_name + '-' + state)
        self.env = SubprocVecEnv(self.env_fns)

    def train(self, game, state, num_e=1, n_timesteps=200000, save='ppo-model'):
        self.create_envs(game_name=game, state_name=state, num_env=num_e,
                         render=self.FLAGS.render)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config):
            self.model = stable_PPO2(policy=CnnPolicy,
                                     env=SubprocVecEnv(self.env_fns),
                                     n_steps=8192,
                                     nminibatches=8,
                                     lam=0.95,
                                     gamma=0.99,
                                     noptepochs=4,
                                     ent_coef=0.001,
                                     learning_rate=lambda _: 2e-5,
                                     cliprange=lambda _: 0.2,
                                     verbose=1,
                                     tensorboard_log=self.FLAGS.logdir)
            # Train in five chunks, checkpointing after each one.
            for i in range(1, 6):
                self.model.learn(n_timesteps)
                self.model.save(save + str(i))

    def retrain(self, game, state, num_e=1, n_timesteps=2000, save='my-model'):
        self.create_envs(game_name=game, state_name=state, num_env=num_e,
                         render=self.FLAGS.render)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config):
            self.model = stable_PPO2.load(self.FLAGS.model,
                                          policy=CnnPolicy,
                                          env=SubprocVecEnv(self.env_fns),
                                          n_steps=8192,
                                          nminibatches=8,
                                          lam=0.95,
                                          gamma=0.99,
                                          noptepochs=4,
                                          ent_coef=0.001,
                                          learning_rate=lambda _: 2e-5,
                                          cliprange=lambda _: 0.2,
                                          verbose=1,
                                          tensorboard_log=self.FLAGS.logdir)
            # Fine-tune in five chunks, checkpointing after each one.
            for i in range(1, 6):
                self.model.learn(n_timesteps)
                self.model.save(save + str(i))

    def evaluate(self, game, state, num_e=1, num_steps=14400):
        self.create_envs(game_name=game, state_name=state, num_env=num_e,
                         render=self.FLAGS.render)
        self.model = stable_PPO2.load(self.FLAGS.model,
                                      SubprocVecEnv(self.env_fns),
                                      policy=CnnPolicy,
                                      tensorboard_log=self.FLAGS.logdir)
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for _ in range(num_steps):
            # _states are only useful when using LSTM policies.
            actions, _states = self.model.predict(obs)
            # actions, rewards and dones are arrays because the env is vectorized.
            obs, rewards, dones, info = self.env.step(actions)

            # Accumulate per-env episode rewards; start a new episode on done.
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

        # Compute the mean reward across environments.
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
        return mean_reward
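# A hedged usage sketch, assuming a gym-retro setup with the Sonic ROM
# installed; FLAGS only needs the attributes the methods above actually read:
from types import SimpleNamespace

flags = SimpleNamespace(render=False, logdir='./logs', model='ppo-model5')
agent = PPO2(flags)
# Train one copy of each Sonic state listed in `environs`.
agent.train(game='SonicTheHedgehog-Genesis', state=agent.environs, num_e=1)
# Evaluate the checkpoint referenced by flags.model on a single state.
agent.evaluate(game='SonicTheHedgehog-Genesis', state=['GreenHillZone.Act1'])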