Example #1
def train_simple_opponent(args):
    env_name = "WimblepongVisualBadAI-v0"
    # env = gym.make(env_name)  # superseded by the vectorized env created below
    #env = ParallelEnvs(env_name, processes=4, envs_per_process=1)
    env = SubprocVecEnv(
        [make_env(env_name, args.seed + i) for i in range(args.num_envs)],
        start_method="spawn")
    env = VecFrameStack(env, n_stack=4)
    if args.algorithm.lower() == "dqn":
        agent = DQNagent.Agent(env_name, env.observation_space,
                               env.action_space)
    elif args.algorithm.lower() == "ppo":
        agent = ppo_agent_stack_4.Agent()
        agent.init_memory(args.steps_per_env, args.num_envs)
        agent.is_training = True
        if args.checkpoint:
            agent.load_checkpoint()
        elif args.pretrained_model:
            agent.load_model()
    else:
        raise NotImplementedError(
            f"No such algorithm: {args.algorithm.lower()}")

    train(env, agent, args)
    agent.save_policy()
    env.close()
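This snippet relies on a make_env(env_name, seed) helper that is not shown. A minimal sketch of such a factory (the name and signature come from the call above; the body is an assumption, not the original project's code):

# Hypothetical make_env factory: SubprocVecEnv needs picklable, zero-argument
# callables, so the helper returns a thunk rather than an env instance.
import gym

def make_env(env_name, seed):
    def _init():
        env = gym.make(env_name)  # each worker process builds its own env
        env.seed(seed)            # distinct seed per worker
        return env
    return _init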
Example #2
    def get_multiproc_env(self, n=10):
        def get_self():
            return deepcopy(self)

        e = SubprocVecEnv([get_self for _ in range(n)], start_method="fork")
        obs = e.reset()
        return e, obs
Example #3
    def create_env(n_envs, eval_env=False, no_log=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether it is an environment used for evaluation or not
        :param no_log: (bool) Do not log training when doing hyperparameter optim
            (issue with writing the same file)
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams
        global env_kwargs

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env or no_log else save_path

        if n_envs == 1:
            env = SubprocVecEnv(
                [make_env(env_id, 0, args.seed, wrapper_class=env_wrapper, log_dir=log_dir, env_kwargs=env_kwargs)]
            )
        else:
            # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
            # On most envs, SubprocVecEnv does not help and is quite memory hungry
            env = SubprocVecEnv(
                [
                    make_env(env_id, i, args.seed, log_dir=log_dir, env_kwargs=env_kwargs, wrapper_class=env_wrapper)
                    for i in range(n_envs)
                ]
            )
        if normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs["norm_reward"] = False
                else:
                    local_normalize_kwargs = {"norm_reward": False}

            if args.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print(f"Normalization activated: {local_normalize_kwargs}")
                else:
                    print("Normalizing input and reward")
            env = VecNormalize(env, **local_normalize_kwargs)

        # Optional Frame-stacking
        if hyperparams.get("frame_stack", False):
            n_stack = hyperparams["frame_stack"]
            env = VecFrameStack(env, n_stack)
            print(f"Stacking {n_stack} frames")

        if is_image_space(env.observation_space):
            if args.verbose > 0:
                print("Wrapping into a VecTransposeImage")
            env = VecTransposeImage(env)
        return env
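For reference, the wrapper ordering this function produces (SubprocVecEnv, then optional VecFrameStack, then VecTransposeImage for image observations) can be reproduced standalone. A sketch assuming an Atari-style image environment id (which requires the Atari extras and is not part of the original script):

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.preprocessing import is_image_space
from stable_baselines3.common.vec_env import SubprocVecEnv, VecFrameStack, VecTransposeImage

# Hypothetical env id chosen for illustration.
venv = make_vec_env("BreakoutNoFrameskip-v4", n_envs=4, vec_env_cls=SubprocVecEnv)
venv = VecFrameStack(venv, n_stack=4)   # optional frame stacking, as above
if is_image_space(venv.observation_space):
    venv = VecTransposeImage(venv)      # HxWxC -> CxHxW for PyTorch CNN policies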
Example #4
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate, unvectorized environment used only for rendering/evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #5
def run_model_stablebaseline3(flow_params,
                              num_cpus=1,
                              rollout_size=5,
                              num_steps=5):
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
    from stable_baselines3 import PPO
    from stable_baselines3.ppo import MlpPolicy
    import torch.nn as nn

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])

    train_model = PPO(MlpPolicy,
                      env=env,
                      verbose=1,
                      n_epochs=rollout_size,
                      tensorboard_log="./PPO_tensorboard/",
                      device="cuda")  # cpu, gpu selection
    # automatically select gpu
    train_model.learn(total_timesteps=num_steps * rollout_size)  #
    return train_model
Example #6
def make_env(seed: int,
             n_envs: int,
             run_dir: str,
             frame_skip: int,
             frame_stack: int,
             is_eval: bool = False) -> VecEnv:
    """
    Makes vectorized env with required wrappers
    :param seed: Random seed
    :param n_envs: Number of environment to run in parallel
    :param run_dir: Run directory
    :param frame_skip: Skip every nth frame
    :param frame_stack: Stack n frames together
    :param is_eval: True if used for evaluation
    :return: Vectorized env
    """
    if n_envs == 1:
        env = DummyVecEnv([_env_fn(seed, run_dir, frame_skip, is_eval)])
    else:
        env = SubprocVecEnv([
            _env_fn(seed + i, run_dir, frame_skip, is_eval)
            for i in range(n_envs)
        ])
    if frame_stack > 0:
        return VecFrameStack(env, n_stack=frame_stack)
    else:
        return env
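_env_fn is defined elsewhere in the project; from the call sites it appears to return a zero-argument constructor. A hypothetical stand-in (signature inferred from the calls above, body assumed):

# Hypothetical _env_fn: returns a thunk so each vectorized worker builds and
# seeds its own environment; the real project's wrappers are unknown.
import gym

def _env_fn(seed: int, run_dir: str, frame_skip: int, is_eval: bool):
    def _init():
        env = gym.make("SomeEnv-v0")  # placeholder env id, an assumption
        env.seed(seed)
        return env
    return _init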
Example #7
def main():
    num_cpu = 1
    load_version = ''
    save_version = '1b_v0'
    load_dir = '../models'
    save_dir = '../models'
    timesteps_per_checkpoint = int(1e6)
    num_checkpoints = int(1e1)  # controlling performance level of agent

    try:
        os.mkdir(save_dir)
    except OSError as error:
        pass

    alg_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
    print('created alg env')

    train_policy = 'MlpPolicy'
    load_path = '{}/alg_v{}.zip'.format(load_dir, load_version)
    if os.path.exists(load_path):
        alg = PPO(train_policy, alg_env, verbose=0)
        alg.set_parameters(load_path, exact_match=True)
        # alg = PPO.load(load_path, env=alg_env)
        print('loaded alg checkpoint ' + load_path)
    else:
        alg = PPO(train_policy, alg_env, verbose=0)
        print('created alg model')

    save_path = '{}/alg_v{}.zip'.format(save_dir, save_version)
    for _ in range(num_checkpoints):
        alg.learn(total_timesteps=timesteps_per_checkpoint)
        alg.save(save_path)
        print('saved alg checkpoint ' + save_path)
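Because SubprocVecEnv forks worker processes, the script's entry point is normally protected by a main guard; a possible invocation (an assumption about how this script is launched):

# Guard so worker processes do not re-execute the training code on import.
if __name__ == '__main__':
    main()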
Example #8
    def make_vec_envs(
        cls,
        evaluating: bool,
        num_processes: int,
        render: bool,
        synchronous: bool,
        log_dir=None,
        mp_kwargs: dict = None,
        **kwargs,
    ) -> VecPyTorch:
        if mp_kwargs is None:
            mp_kwargs = {}

        if num_processes == 1:
            synchronous = True

        if synchronous:
            kwargs.update(mp_kwargs)

        def env_thunk(rank):
            def thunk(**_kwargs):
                return cls.make_env(rank=rank,
                                    evaluating=evaluating,
                                    **_kwargs,
                                    **kwargs)

            return thunk

        env_fns = [env_thunk(i) for i in range(num_processes)]
        return VecPyTorch(
            DummyVecEnv(env_fns, render=render)
            if synchronous or num_processes == 1 else SubprocVecEnv(
                env_fns, **mp_kwargs, start_method="fork", render=render))
Example #9
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None):
    envs = [
        make_env(env_name, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, norm_reward=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
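A possible call site for the helper above (argument values are illustrative only; VecPyTorch, VecPyTorchFrameStack, and make_env come from the surrounding project):

import torch

# Illustrative arguments; not taken from the original repository.
envs = make_vec_envs("PongNoFrameskip-v4",
                     seed=1,
                     num_processes=8,
                     gamma=0.99,
                     log_dir="/tmp/gym",
                     device=torch.device("cpu"),
                     allow_early_resets=False)
obs = envs.reset()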
Example #10
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    model = A2C('MlpPolicy', env, verbose=1, n_steps=5)
    model.learn(total_timesteps=2500000000)
Example #11
def create_vectorized_environment(
        n_envs: int, frame_stack: int,
        env_creation_func: t.Callable) -> VecTransposeImage:
    """Creates a vectorized environment for image-based models.

    :param n_envs: The number of parallel environment to run.
    :param frame_stack: The number of frame to stack in each environment.
    :param env_creation_func: A callable returning a Gym environment.
    :return: A vectorized environment with frame stacking and image transposition.
    """
    return VecTransposeImage(
        VecFrameStack(SubprocVecEnv([env_creation_func] * n_envs),
                      frame_stack))
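One way to supply env_creation_func is a picklable partial around gym.make, since SubprocVecEnv sends the callables to worker processes. A sketch (the env id is an assumption):

import functools
import gym

# functools.partial is picklable, so it can be shipped to SubprocVecEnv workers.
venv = create_vectorized_environment(
    n_envs=4,
    frame_stack=4,
    env_creation_func=functools.partial(gym.make, "PongNoFrameskip-v4"))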
Example #12
def make_venv(args):
    if not args.subproc:
        # Performs actions sequentially
        venv = DummyVecEnv(
            [make_env(args.env, args.subproc, i) for i in range(args.num_env)]
        )
    else:
        # Performs actions in parallel processes
        venv = SubprocVecEnv(
            [make_env(args.env, args.subproc, i) for i in range(args.num_env)]
        )

    return venv
Example #13
def make_ai_matchmaker_stack(all_stats,
                             all_opps,
                             all_elos,
                             game_path,
                             model_dir,
                             base_port=50000,
                             image_based=False,
                             level_path=None,
                             env_p=3,
                             starting_elo=None,
                             K=16,
                             D=5.,
                             time_reward=-0.003,
                             num_envs=1,
                             matchmaking_mode=0,
                             win_loss_ratio="0:0"):

    envs = []
    for i in range(num_envs):
        envs.append(
            lambda a=all_stats, b=all_opps, c=all_elos, d=game_path, e=model_dir, f=base_port+(i*2), g=base_port+(i*2)+1, \
            h=image_based, i=level_path, j=env_p, k=starting_elo, l=time_reward, m=matchmaking_mode, \
            n=[int(x) for x in win_loss_ratio.split(':')]:
                    AIMatchmaker(a,b,c,d,e,
                            base_port=f,
                            my_port=g,
                            image_based=h,
                            level_path=i,
                            env_p=j,
                            starting_elo=k,
                            time_reward=l,
                            matchmaking_mode=m,
                            win_loss_ratio=n
                    )
        )
    env_stack = SubprocVecEnv(envs, start_method="fork")
    env_stack.reset()
    return env_stack
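The default-argument lambdas above (and the similar ones in Example #16) work around Python's late binding of closure variables: without the defaults, every thunk would see the final loop values. A minimal illustration of the pattern, independent of AIMatchmaker:

# Closures capture loop variables by reference; default arguments freeze the
# current value at lambda-creation time.
thunks_late = [lambda: i for i in range(3)]
thunks_bound = [lambda i=i: i for i in range(3)]
print([f() for f in thunks_late])   # [2, 2, 2]
print([f() for f in thunks_bound])  # [0, 1, 2]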
Example #14
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder 
    local_eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                              record_video_trigger=lambda step: step == 0, video_length=videoLength,
                              name_prefix=prefix)
    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
Example #15
def main():
    #env_id = "CartPole-v1"
    vix_env = trading_vix_env.trading_vix_env()
    num_cpu = 20  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(vix_env, i) for i in range(num_cpu)])

    # Create log dir
    log_dir = './ppo_data'
    os.makedirs(log_dir, exist_ok=True)
    env = VecMonitor(env, log_dir)
    callback = custom_call_back.CustomCallback(check_freq = 1000,log_dir = log_dir)

    model = PPO('MlpPolicy', env, verbose=1,n_steps=500,batch_size = 10000)
    model.learn(total_timesteps=2500000000,callback = callback)
Example #16
def make_env_stack(num_envs,
                   game_path,
                   base_port,
                   game_log_path,
                   opp_fp_and_elo,
                   trainee_elo,
                   elo_match=True,
                   survivor=False,
                   stdout_path=None,
                   level_path=None,
                   image_based=False,
                   time_reward=0.,
                   env_p=3):
    if num_envs >= 1:
        envs = []
        for i in range(num_envs):
            envs.append(
                lambda game_path=game_path,
                       b=base_port + (i * 2),
                       c=game_log_path.replace(".txt", "-" + str(i) + ".txt"),
                       d=opp_fp_and_elo,
                       e=elo_match,
                       f=trainee_elo,
                       g=survivor,
                       h=stdout_path.replace(".txt", "-" + str(i) + ".txt"),
                       i=level_path,
                       j=image_based,
                       k=time_reward: TankEnv(game_path,
                                              game_port=b,
                                              game_log_path=c,
                                              opp_fp_and_elo=d,
                                              elo_match=e,
                                              center_elo=f,
                                              survivor=g,
                                              stdout_path=h,
                                              verbose=True,
                                              level_path=i,
                                              image_based=j,
                                              time_reward=k,
                                              p=env_p))
        if num_envs == 1:
            env_stack = SubprocVecEnv(envs, start_method="fork")
        else:
            env_stack = SubprocVecEnv(envs, start_method="forkserver")
        env_stack.reset()
        return env_stack
    else:
        env = TankEnv(game_path,
                      game_port=base_port,
                      game_log_path=game_log_path,
                      opp_fp_and_elo=opp_fp_and_elo,
                      elo_match=elo_match,
                      center_elo=trainee_elo,
                      survivor=survivor,
                      stdout_path=stdout_path,
                      level_path=level_path,
                      image_based=image_based,
                      time_reward=time_reward,
                      p=env_p)
        env.reset()
        return env
Example #17
def make_ai_matchmaker_eval_stack(game_path, base_port, image_based, level_path, env_p, num_envs):
        envs = []
        for i in range(num_envs):
            envs.append(
                lambda a=game_path, b=base_port+(i*2), c=base_port+(i*2)+1, d=image_based, e=level_path, f=env_p: 
                    TankEnv(a, 
                        opp_fp_and_elo=[], 
                        game_port=b, 
                        my_port=c, 
                        elo_match=False,
                        image_based=d,
                        level_path=e,
                        p=f
                    )
            )
        env_stack = SubprocVecEnv(envs, start_method="fork")
        return env_stack
Example #18
def test_vec_env_is_wrapped():
    # Test is_wrapped call of subproc workers
    def make_env():
        return CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2)))

    def make_monitored_env():
        return Monitor(
            CustomGymEnv(gym.spaces.Box(low=np.zeros(2), high=np.ones(2))))

    # One with monitor, one without
    vec_env = SubprocVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env.close()

    # One with monitor, one without
    vec_env = DummyVecEnv([make_env, make_monitored_env])

    assert vec_env.env_is_wrapped(Monitor) == [False, True]

    vec_env = VecFrameStack(vec_env, n_stack=2)
    assert vec_env.env_is_wrapped(Monitor) == [False, True]
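CustomGymEnv is a helper from the stable-baselines3 test suite; a minimal stand-in with the same constructor shape (an approximation, not the library's exact class):

import gym

class CustomGymEnv(gym.Env):
    """Tiny environment whose observation space is injected at construction."""

    def __init__(self, observation_space):
        self.observation_space = observation_space
        self.action_space = gym.spaces.Discrete(2)

    def reset(self):
        return self.observation_space.sample()

    def step(self, action):
        return self.observation_space.sample(), 0.0, False, {}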
Example #19
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Separate, unvectorized environment used only for rendering/evaluation
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])
    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #20
def make_vec_envs(env_name,
                  seed,
                  dummy_vecenv,
                  parallel,
                  time_limit,
                  wrappers,
                  device,
                  monitor_dir=None):
    envs = [
        make_env(env_name, seed, i, time_limit, wrappers, monitor_dir)
        for i in range(parallel)
    ]

    if dummy_vecenv or len(envs) == 1 or monitor_dir:
        envs = MADummyVecEnv(envs)
    else:
        envs = SubprocVecEnv(envs, start_method="fork")

    envs = VecPyTorch(envs, device)
    return envs
Example #21
def multiprocessing_example():
    # Multiprocessing: Unleashing the Power of Vectorized Environments

    def make_env(env_id, rank, seed=0):
        """
		Utility function for multiprocessed env.

		:param env_id: (str) the environment ID.
		:param num_env: (int) the number of environments you wish to have in subprocesses.
		:param seed: (int) the inital seed for RNG.
		:param rank: (int) index of the subprocess.
		"""
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use.
    # Create the vectorized environment.
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper which does exactly the previous steps for you.
    # You can choose between 'DummyVecEnv' (usually faster) and 'SubprocVecEnv'.
    #env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #22
def main():
  # Create the callback: check every 1000 steps
  log_dir = 'log'
  callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
  num_cpu = 16
  model_stats_path = os.path.join(log_dir, "sac_" + env_name)
  env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
  tb_log = 'tb_log'
  videoName = '5M_timesteps_sac'
  tb_log_name = videoName

  if(StartFresh):
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch':[128,64,32],
        }
        model = PPO('MlpPolicy', 
          env, 
          learning_rate = 0.001,
          n_steps=500,
          # batch_size=0,
          # n_epochs=1,
          gamma=0.9,
          policy_kwargs = policy_kwargs, 
          verbose=1, 
          tensorboard_log=tb_log,
          device="auto")
  else:
      env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
      env = VecNormalize.load(env_stats_path, env)
      env.reset()

      
      model = PPO.load(model_stats_path, tensorboard_log=tb_log)
      model.set_env(env)

  if(DoTraining):
    eval_env = make_vec_env(env_name, n_envs=1)
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    eval_env.reset()
    # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
    model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False) #, callback=callback, =TensorboardCallback()

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(model_stats_path)
    env.save(env_stats_path)
    
  if(DoVideo):
    # mean_reward, std_reward = evaluate_policy(model, eval_env)
    # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
    record_video(env_name, model, video_length=2000, prefix='ppo_'+ env_name + videoName)
Example #23
    #     obs, reward, done, _ = env.step(env.action_space.sample())
    #     if done==True:
    #         break
    #     env.render()


    def make_env(env: gym.Env, rank: int, seed: int = 0) -> Callable:
        """
        Utility function for multiprocessed env.
        
        :param env: (gym.Env) the environment instance used by each subprocess
        :param rank: (int) index of the subprocess
        :param seed: (int) the initial seed for RNG
        :return: (Callable)
        """
        def _init() -> gym.Env:
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    params = {"learning_rate": 1e-5}
    vec_env = SubprocVecEnv([make_env(env, i) for i in range(4)])

    agent = A2C('MlpPolicy', vec_env, verbose=0)
    # agent = A2C(MlpPolicy, env, n_steps=1000, **params)

    agent.learn(total_timesteps=1000)
Example #24
def make_vec_env(
    env_name: str,
    n_envs: int = 8,
    seed: int = 0,
    parallel: bool = False,
    log_dir: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
    post_wrappers: Optional[Sequence[Callable[[gym.Env, int], gym.Env]]] = None,
) -> VecEnv:
    """Returns a VecEnv initialized with `n_envs` Envs.

    Args:
        env_name: The Env's string id in Gym.
        n_envs: The number of duplicate environments.
        seed: The environment seed.
        parallel: If True, uses SubprocVecEnv; otherwise, DummyVecEnv.
        log_dir: If specified, saves Monitor output to this directory.
        max_episode_steps: If specified, wraps each env in a TimeLimit wrapper
            with this episode length. If not specified and `max_episode_steps`
            exists for this `env_name` in the Gym registry, uses the registry
            `max_episode_steps` for every TimeLimit wrapper (this automatic
            wrapper is the default behavior when calling `gym.make`). Otherwise
            the environments are passed into the VecEnv unwrapped.
        post_wrappers: If specified, iteratively wraps each environment with each
            of the wrappers specified in the sequence. The argument should be a Callable
            accepting two arguments, the Env to be wrapped and the environment index,
            and returning the wrapped Env.
    """
    # Resolve the spec outside of the subprocess first, so that it is available to
    # subprocesses running `make_env` via automatic pickling.
    spec = gym.spec(env_name)

    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environment unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.
        env = spec.make()

        # Seed each environment with a different, non-sequential seed for diversity
        # (even if caller is passing us sequentially-assigned base seeds). int() is
        # necessary to work around gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif spec.max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, "monitor")
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f"mon{i:03d}")

        env = monitor.Monitor(env, log_path)
        env = wrappers.RolloutInfoWrapper(env)

        if post_wrappers:
            for wrapper in post_wrappers:
                env = wrapper(env, i)

        return env

    rng = np.random.RandomState(seed)
    env_seeds = rng.randint(0, (1 << 31) - 1, (n_envs,))
    env_fns = [functools.partial(make_env, i, s) for i, s in enumerate(env_seeds)]
    if parallel:
        # See GH hill-a/stable-baselines issue #217
        return SubprocVecEnv(env_fns, start_method="forkserver")
    else:
        return DummyVecEnv(env_fns)
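A possible call, assuming the surrounding module's imports (gym, numpy, functools, the Monitor and TimeLimit wrappers) are in place; the values are illustrative:

# Eight parallel CartPole workers with Monitor logs written under "logs/monitor".
venv = make_vec_env("CartPole-v1", n_envs=8, seed=0, parallel=True, log_dir="logs")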
Example #25
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #26
def create_test_env(env_id,
                    n_envs=1,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None,
                    env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For PyBullet envs, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])
    # PyBullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #27
def create_test_env(
    env_id, n_envs=1, stats_path=None, seed=0, log_dir="", should_render=True, hyperparams=None, env_kwargs=None
):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For PyBullet envs, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Clean hyperparams, so the dict can be passed to the model constructor
    if True:
        keys_to_delete = ["n_envs", "n_timesteps", "env_wrapper", "callback", "frame_stack"]
        for key in keys_to_delete:
            delete_key(hyperparams, key)

    if n_envs > 1:
        # start_method = 'spawn' for thread safety
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir, env_kwargs=env_kwargs) for i in range(n_envs)]
        )
    # PyBullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id or "Walker2D" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            # print("Loading running average")
            # print("with params: {}".format(hyperparams["normalize_kwargs"]))
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)
    return env
Example #28
    tb_log_folder = 'ppo_fetchpush_tensorboard'
    tb_log_name = '2M_OSC_POSE'
    load_model_for_training_path = None
    load_vecnormalize_for_training_path = 'trained_models/vec_normalize_6M_OSC_POSE.pkl'
    save_model_folder = 'trained_models'
    save_model_filename = '2M_OSC_POSE'
    load_model_folder = 'trained_models'
    load_model_filename = '2M_OSC_POSE'

    save_model_path = os.path.join(save_model_folder, save_model_filename)
    save_vecnormalize_path = os.path.join(save_model_folder, 'vec_normalize_' + save_model_filename + '.pkl')
    load_model_path = os.path.join(load_model_folder, load_model_filename)
    load_vecnormalize_path = os.path.join(load_model_folder, 'vec_normalize_' + load_model_filename + '.pkl')

    if training:
        env = SubprocVecEnv([make_training_env(env_id, options, i) for i in range(num_cpu)])
        env = VecNormalize(env)

        if isinstance(load_model_for_training_path, str):
            env = VecNormalize.load(load_vecnormalize_for_training_path, env)
            model = PPO.load(load_model_for_training_path, env=env)
        else:
            model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log_folder)
        
        eval_env_func = make_training_env(env_id, options, rank=num_cpu)
        eval_env = DummyVecEnv([eval_env_func])
        eval_env = VecNormalize(eval_env)

        eval_callback = EvalCallback(eval_env, best_model_save_path='./best_models/',
                             log_path='./logs_best_model/',
                             deterministic=True, render=False, n_eval_episodes=10)
Example #29
def main(config: DictConfig) -> None:

    start_time = time.time()

    set_up(config)

    device = torch.device('cuda:' +
                          str(config.core.gpu_id) if torch.cuda.is_available()
                          and config.core.use_gpu else 'cpu')

    # Set up Wandb (Pass config variables to wandb)
    if config.log.use_wandb:
        hparams = {}
        for key, value in config.items():
            hparams.update(value)

        wandb.init(project="GRF_RL_training", config=hparams)

    if config.log.use_wandb:
        log_handler = wandb
    else:
        log_handler = None

    # Lambda Function to Create Environment
    def make_env(i):
        def thunk():
            if not config.env.use_kaggle_wrapper:
                env = FootballEnvWrapper(
                    env_name=config.env.env_name,
                    obs_representation=config.env.obs_representation,
                    rewards=config.env.rewards,
                    logdir=config.store.log_path,
                    env_id=i)
            else:
                print("Training against agent: " +
                      join(config.env.adversarial_agent_path,
                           config.env.adversarial_agent))
                env = KaggleEnvWrapper(
                    adversarial_agent=join(config.env.adversarial_agent_path,
                                           config.env.adversarial_agent),
                    env_name=config.env.env_name,
                    obs_representation=config.env.obs_representation,
                    rewards=config.env.rewards,
                    logdir=config.store.log_path,
                    env_id=i)
            env.seed(i)
            return env

        return thunk

    if config.env.parallel_env:
        envs = SubprocVecEnv([make_env(i) for i in range(config.env.num_envs)])
    else:
        envs = DummyVecEnv([make_env(i) for i in range(config.env.num_envs)])

    policy_kwargs = dict(
        features_extractor_class=ImpalaCNN,
        features_extractor_kwargs=dict(features_dim=256),
    )

    # Stable-baselines3 PPO
    model = PPO(policy="CnnPolicy",
                policy_kwargs=policy_kwargs,
                env=envs,
                learning_rate=config.train.learning_rate,
                n_steps=config.train.num_steps,
                n_epochs=config.train.update_epochs,
                batch_size=config.train.batch_size,
                clip_range=config.train.clip_range,
                gamma=config.train.gamma,
                gae_lambda=config.train.gae_lambda,
                max_grad_norm=config.train.max_grad_norm,
                vf_coef=config.train.vf_coef,
                ent_coef=config.train.ent_coef,
                log_handler=log_handler,
                model_checkpoints_path=config.store.model_path,
                pretrained_model=join(config.model.pretrained_model_path,
                                      config.model.pretrained_model),
                use_prierarchy_loss=config.train.use_prierarchy_loss,
                device=device,
                verbose=1)
    model.learn(total_timesteps=1000000000, log_interval=6)
Example #30
        env = MazeGridEnv()
        if sparse:
            env = SparseRewardWrapper(env)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    args = parser.parse_args()

    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv(
        [make_env(i, sparse=args.sparse) for i in range(num_cpu)])

    time_steps = args.time_steps
    model = A2C('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=time_steps)

    if not args.sparse:
        model_save += "/MazeGridEnv"
    else:
        model_save += "/SparseMazeGridEnv"

    if not os.path.exists(model_save):
        os.makedirs(model_save)

    model.save(model_save + "/A2C")