Example #1
    def __init__(self, env_fns, start_method=None):
        if start_method is None:
            start_method = 'forkserver'  # thread safe by default
        SubprocVecEnv.__init__(self, env_fns, start_method=start_method)
        env = env_fns[0]()
        num_agents = getattr_unwrapped(env, 'num_agents')
        env.close()
        VecMultiEnv.__init__(self, self.num_envs, num_agents,
                             self.observation_space, self.action_space)
Example #2
    def create_envs(self, game_name, state_name, num_env, render):
        for state in state_name:
            for i in range(num_env):
                print()
                self.env_fns.append(
                    partial(make_env,
                            game=game_name,
                            state=state,
                            render=render))
                self.env_names.append(game_name + '-' + state)
        self.env = SubprocVecEnv(self.env_fns)
Example #3
def create_training_env(
    number_of_processes,
    level='academy_empty_goal_close',
    stacked=True,
    representation='extracted',
    reward_experiment='scoring,checkpoints',
    write_goal_dumps=False,
    write_full_episode_dumps=False,
    write_video=False,
    dump_frequency=1,
) -> SubprocVecEnv:
    """
    The meaning of all arguments can be found in gfootball/examples/run_ppo2.py.
    :return: stable_baselines.common.vec_env.subproc_vec_env.SubprocVecEnv
    """
    return SubprocVecEnv([(lambda _i=i: _create_single_football_env(
        process_number=_i,
        level=level,
        stacked=stacked,
        representation=representation,
        reward_experiment=reward_experiment,
        write_goal_dumps=write_goal_dumps,
        write_full_episode_dumps=write_full_episode_dumps,
        write_video=write_video,
        dump_frequency=dump_frequency,
        render=False,
    )) for i in range(number_of_processes)])
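A minimal usage sketch, not part of the original example: the SubprocVecEnv returned by create_training_env can be passed directly to a stable-baselines algorithm such as PPO2. The process count, hyperparameters and timestep budget below are illustrative assumptions.

# Hypothetical usage of create_training_env defined above.
from stable_baselines import PPO2

envs = create_training_env(number_of_processes=4)
model = PPO2('CnnPolicy', envs, verbose=1)  # 'CnnPolicy' suits the pixel-based 'extracted' representation
model.learn(total_timesteps=100000)
envs.close()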
Example #4
def make_atari_env(env_id,
                   num_env,
                   seed,
                   wrapper_kwargs=None,
                   start_index=0,
                   allow_early_resets=True):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (VecEnv) The vectorized Atari environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
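A minimal usage sketch, an assumption rather than part of the original source; the Atari ID and process count are illustrative. The returned SubprocVecEnv operates on batches, one entry per subprocess.

# Hypothetical usage of make_atari_env defined above.
envs = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=0)
obs = envs.reset()  # batched observations, one per subprocess env
obs, rewards, dones, infos = envs.step([envs.action_space.sample() for _ in range(8)])
envs.close()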
Example #5
    def retrain(self, game, state, num_e=1, n_timesteps=2000, save='my-model'):
        self.create_envs(game_name=game,
                         state_name=state,
                         num_env=num_e,
                         render=self.FLAGS.render)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config):
            self.model = stable_PPO2.load(self.FLAGS.model,
                                          policy=CnnPolicy,
                                          env=SubprocVecEnv(self.env_fns),
                                          n_steps=8192,
                                          nminibatches=8,
                                          lam=0.95,
                                          gamma=0.99,
                                          noptepochs=4,
                                          ent_coef=0.001,
                                          learning_rate=lambda _: 2e-5,
                                          cliprange=lambda _: 0.2,
                                          verbose=1,
                                          tensorboard_log=self.FLAGS.logdir)

        for i in range(1, 6):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))
Example #6
    def evaluate(self, game, state, num_e=1, num_steps=14400):
        self.create_envs(game_name=game,
                         state_name=state,
                         num_env=num_e,
                         render=self.FLAGS.render)
        self.model = stable_PPO2.load(self.FLAGS.model,
                                      SubprocVecEnv(self.env_fns),
                                      policy=CnnPolicy,
                                      tensorboard_log=self.FLAGS.logdir)
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for _ in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # actions, rewards and dones are arrays
            # because we are using a vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

        # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
Example #7
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None):
    envs = [
        make_env(env_name, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        # envs = ShmemVecEnv(envs, context='fork')
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
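A minimal usage sketch, assuming make_env, VecPyTorch and the other wrappers referenced above are importable from the same module; the environment name and log directory are illustrative.

# Hypothetical usage of make_vec_envs defined above.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
envs = make_vec_envs('PongNoFrameskip-v4', seed=1, num_processes=4,
                     gamma=0.99, log_dir='/tmp/ppo_logs', device=device,
                     allow_early_resets=False)
obs = envs.reset()  # a torch tensor on `device`, stacked across the 4 subprocess envs
envs.close()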
Example #8
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        if "num_population" in args.__dict__:
            args.num_cpu = args.num_population * 2

        assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
            "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(args.num_cpu, args.env,
                                             env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        envs = [
            makeEnv(args.env,
                    args.seed,
                    i,
                    args.log_dir,
                    allow_early_resets=True,
                    env_kwargs=env_kwargs) for i in range(args.num_cpu)
        ]
        envs = SubprocVecEnv(envs)
        envs = VecFrameStack(envs, args.num_stack)
        if args.srl_model != "raw_pixels" and args.algo_type == "v2":
            envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
            envs = loadRunningAverage(envs,
                                      load_path_normalise=load_path_normalise)
        return envs
Example #9
def make_tutankham_env_test():
    def make_env():
        def _thunk():
            env = gym.make('Tutankham-v4')
            return wrap_env(env, False)

        return _thunk

    return SubprocVecEnv([make_env()])
Example #10
def make_tutankham_env(num_env, seed=0, start_index=0):
    def make_env(rank):
        def _thunk():
            env = gym.make('Tutankham-v4')
            env.seed(seed + rank)
            env = Monitor(env, filename=None, allow_early_resets=True)
            return wrap_env(env, True)

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #11
def create_demo_env(
    level='academy_empty_goal_close',
    reward_experiment='scoring,checkpoints',
    stacked=True,
    representation='extracted',
    render=False,
):
    return SubprocVecEnv([(lambda _i=i: _create_single_football_env(
        process_number=_i,
        level=level,
        stacked=stacked,
        representation=representation,
        reward_experiment=reward_experiment,
        write_goal_dumps=False,
        write_full_episode_dumps=False,
        write_video=False,
        dump_frequency=1,
        render=render,
    )) for i in range(1)])
Example #12
class PPO2:
    def __init__(self, FLAGS):
        self.FLAGS = FLAGS
        self.env_fns = []
        self.env_names = []
        self.environs = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3', 'SpringYardZone.Act1', 'GreenHillZone.Act2',
            'StarLightZone.Act3', 'ScrapBrainZone.Act1'
        ]
        self.environsv2 = ['1Player.Axel.Level1']

    def create_envs(self, game_name, state_name, num_env, render):
        for state in state_name:
            for i in range(num_env):
                print()
                self.env_fns.append(
                    partial(make_env,
                            game=game_name,
                            state=state,
                            render=render))
                self.env_names.append(game_name + '-' + state)
        self.env = SubprocVecEnv(self.env_fns)

    def train(self,
              game,
              state,
              num_e=1,
              n_timesteps=200000,
              save='ppo-model'):
        self.create_envs(game_name=game,
                         state_name=state,
                         num_env=num_e,
                         render=self.FLAGS.render)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config):
            self.model = stable_PPO2(policy=CnnPolicy,
                                     env=SubprocVecEnv(self.env_fns),
                                     n_steps=8192,
                                     nminibatches=8,
                                     lam=0.95,
                                     gamma=0.99,
                                     noptepochs=4,
                                     ent_coef=0.001,
                                     learning_rate=lambda _: 2e-5,
                                     cliprange=lambda _: 0.2,
                                     verbose=1,
                                     tensorboard_log=self.FLAGS.logdir)

        for i in range(1, 6):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))

    def retrain(self, game, state, num_e=1, n_timesteps=2000, save='my-model'):
        self.create_envs(game_name=game,
                         state_name=state,
                         num_env=num_e,
                         render=self.FLAGS.render)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config):
            self.model = stable_PPO2.load(self.FLAGS.model,
                                          policy=CnnPolicy,
                                          env=SubprocVecEnv(self.env_fns),
                                          n_steps=8192,
                                          nminibatches=8,
                                          lam=0.95,
                                          gamma=0.99,
                                          noptepochs=4,
                                          ent_coef=0.001,
                                          learning_rate=lambda _: 2e-5,
                                          cliprange=lambda _: 0.2,
                                          verbose=1,
                                          tensorboard_log=self.FLAGS.logdir)

        for i in range(1, 6):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))

    def evaluate(self, game, state, num_e=1, num_steps=14400):
        self.create_envs(game_name=game,
                         state_name=state,
                         num_env=num_e,
                         render=self.FLAGS.render)
        self.model = stable_PPO2.load(self.FLAGS.model,
                                      SubprocVecEnv(self.env_fns),
                                      policy=CnnPolicy,
                                      tensorboard_log=self.FLAGS.logdir)
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for _ in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # actions, rewards and dones are arrays
            # because we are using a vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

        # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward