Example #1
def record_video(env_id,
                 model,
                 video_length=500,
                 prefix='',
                 video_folder='videos'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
        """
    eval_env = DummyVecEnv(
        [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
    # eval_env = gym.make(env_id)
    eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                 eval_env)

    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)

    obs = eval_env.reset()
    for i in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)

    # Close the video recorder
    eval_env.close()
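
Example #1 calls a `make_env(env_id, rank, log_dir=...)` helper (also used in Example #9) that is not part of this listing. A minimal sketch of such a factory, assuming it returns a thunk that builds one seeded, Monitor-wrapped environment; the exact signature and logging path are assumptions:

import gym
from stable_baselines3.common.monitor import Monitor

def make_env(env_id, rank, seed=0, log_dir=None):
    # Hypothetical helper: return a thunk so DummyVecEnv/SubprocVecEnv can build the env lazily
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # give each worker a distinct seed
        # Monitor records episode rewards/lengths; the per-rank file name is an assumption
        return Monitor(env, None if log_dir is None else f"{log_dir}/{rank}")
    return _init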
Example #2
def pybullet_example():
    # PyBullet: Normalizing input features

    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
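
A short continuation (not part of the original example) showing one way the reloaded agent and the frozen normalization statistics could be checked, using the standard `evaluate_policy` helper on the `model` and `env` created above:

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the reloaded agent on the env with frozen VecNormalize statistics
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")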
Example #3
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
Example #4
def test_vec_env(tmp_path, make_env):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv,
                             norm_obs=True,
                             norm_reward=True,
                             clip_obs=clip_obs,
                             clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        if isinstance(obs, dict):
            for key in obs.keys():
                assert np.max(np.abs(obs[key])) <= clip_obs
        else:
            assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = tmp_path / "vec_normalize"
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
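
`check_vec_norm_equal` is defined elsewhere in the test module. A rough sketch of what such a comparison might look like, assuming only the wrapper configuration and running return statistics need to match (the attribute selection is an assumption, not the actual test helper):

import numpy as np

def check_vec_norm_equal(norm_venv, deserialized):
    # Hypothetical check: compare configuration and running statistics of two VecNormalize wrappers
    assert norm_venv.observation_space == deserialized.observation_space
    assert norm_venv.action_space == deserialized.action_space
    assert norm_venv.clip_obs == deserialized.clip_obs
    assert norm_venv.clip_reward == deserialized.clip_reward
    assert np.allclose(norm_venv.ret_rms.mean, deserialized.ret_rms.mean)
    assert np.allclose(norm_venv.ret_rms.var, deserialized.ret_rms.var)
    # comparing obs_rms as well would need an extra branch for dict observation spaces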
Example #5
def create_zoo_env(env_id, stats_dir, hyperparams, should_render=False):
    env_wrapper = get_wrapper_class(hyperparams)

    vec_env_cls = DummyVecEnv
    if "Bullet" in env_id and should_render:
        vec_env_cls = SubprocVecEnv

    env = make_vec_env(env_id,
                       wrapper_class=env_wrapper,
                       vec_env_cls=vec_env_cls)

    if stats_dir is not None:
        if hyperparams["normalize"]:
            norm_fpath = pjoin(stats_dir, "vecnormalize.pkl")

            if os.path.exists(norm_fpath):
                env = VecNormalize.load(norm_fpath, env)
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {norm_fpath} not found")

    max_episode_steps = gym.make(env_id).spec.max_episode_steps
    Spec = namedtuple("Spec", ["max_episode_steps"])
    env.spec = Spec(max_episode_steps=max_episode_steps)

    return env
Example #6
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Create a separate environment for rendering
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #7
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
Example #8
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Example #9
def main():
  # Create the callback: check every 1000 steps
  log_dir = 'log'
  callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
  num_cpu = 16
  model_stats_path = os.path.join(log_dir, "sac_" + env_name)
  env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
  tb_log = 'tb_log'
  videoName = '5M_timesteps_sac'
  tb_log_name = videoName

  if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch':[128,64,32],
        }
        model = PPO('MlpPolicy', 
          env, 
          learning_rate = 0.001,
          n_steps=500,
          # batch_size=0,
          # n_epochs=1,
          gamma=0.9,
          policy_kwargs = policy_kwargs, 
          verbose=1, 
          tensorboard_log=tb_log,
          device="auto")
  else:
      env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
      env = VecNormalize.load(env_stats_path, env)
      env.reset()

      
      model = PPO.load(model_stats_path, tensorboard_log=tb_log)
      model.set_env(env)

  if DoTraining:
    eval_env = make_vec_env(env_name, n_envs=1)
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    eval_env.reset()
    # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
    model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False)  # optionally: callback=callback or callback=TensorboardCallback()

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(model_stats_path)
    env.save(env_stats_path)
    
  if DoVideo:
    # mean_reward, std_reward = evaluate_policy(model, eval_env)
    # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
    record_video(env_name, model, video_length=2000, prefix='ppo_'+ env_name + videoName)
Example #10
def run_environment(
        algorithm: RLAlgorithm = typer.Option(...),
        agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
        agent_parameters_path: Optional[Path] = None,
        random_agent: bool = False,
        seed: Optional[int] = None,
        environment_port: Optional[int] = None,
        normalize: bool = False,
        n_envs: Optional[int] = None):
    """Run the reacher environment and visualize the actions of the agents.

    Args:
        agent_type: choice between single and multi agent environments
        agent_parameters_path: an optional path to load the agent parameters from
        random_agent: if true, agent(s) use a random policy
        seed: seed for the environment; if not set, it will be picked randomly
        environment_port: the port used from python to communicate with the C# environment backend. By using different
            values, one can run multiple environments in parallel.
    """
    env = create_environment(agent_type=agent_type,
                             normalize=False,
                             n_envs=n_envs,
                             env_seed=seed,
                             environment_port=environment_port,
                             training_mode=False,
                             no_graphics=False)

    if normalize:
        env = VecNormalize.load(
            str(agent_parameters_path.parent / 'vecnormalize.pkl'), env)

    action_size = env.action_space.shape[0]

    if random_agent:
        agent = RandomAgent(number_of_agents=n_envs, action_size=action_size)
    else:
        agent = TrainedAgent(algorithm=algorithm,
                             parameters_path=str(agent_parameters_path))

    score = 0
    state = env.reset()
    while True:
        actions = agent.act(state)
        state, reward, done, _ = env.step(actions)
        score += reward
        time.sleep(0.005)
        if np.any(done):
            break

    if agent_type == SingleOrMultiAgent.single_agent:
        print(f'Total score this episode: {score}')
    else:
        print(f'Average total score this episode: {np.array(score).mean()}')

    env.close()
Example #11
    def test(self, model_filename, vnorm_filename):
        self.model.load(model_filename)
        self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for i in range(1000):
                action, _ = self.model.predict(obs, deterministic=True)
                self.eval_env.step(action)

        self.eval_env.close()
Example #12
def test(test_n, seed, model_filename, vec_filename, train, test, test_as_class=0, render=False, save_file="default.yml"):

    print("Testing:")
    total_rewards = []
    distance_xs = []
    for i in range(test_n):
        print(f" Seed {seed+i}, model {model_filename} vec {vec_filename}")
        print(f" Train on {train}, test on {test}, w/ bodyinfo {test_as_class}")
        eval_env = utils.make_env(render=render, wrapper=None, robot_body=test, body_info=test_as_class)
        eval_env = DummyVecEnv([eval_env])
        eval_env = VecNormalize.load(vec_filename, eval_env)
        eval_env.norm_reward = False

        eval_env.seed(seed+i)
        model = PPO.load(model_filename)

        obs = eval_env.reset()
        if render:
            eval_env.env_method("set_view")
        distance_x = 0
        # print(obs)
        total_reward = 0
        for step in range(1000):
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = eval_env.step(action)
            if done:
                break
            else:  # the last observation will be after reset, so skip the last
                distance_x = eval_env.envs[0].robot.body_xyz[0]
            total_reward += reward[0]
            if render:
                time.sleep(0.01)

        eval_env.close()
        print(f"train {train}, test {test}, test_as_class {test_as_class}, step {step}, total_reward {total_reward}, distance_x {distance_x}")

        total_rewards.append(total_reward)
        distance_xs.append(distance_x)

    # convert to plain floats so yaml does not serialize numpy types
    total_rewards = [float(x) for x in total_rewards]
    distance_xs = [float(x) for x in distance_xs]

    data = {
        "title": "test",
        "train": train,
        "test": test,
        "total_reward": total_rewards,
        "distance_x": distance_xs,
    }
    with open(f"{save_file}", "w") as f:
        yaml.dump(data, f)
Example #13
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    #  do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    
    # env = gym.make(args.env)
    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action, _ = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))

    if args.timesteps is None:
        while not done: 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps): 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()

    if args.gif:
        imageio.mimsave(f'{os.path.join("logs", args.env, args.agent, "recording.gif")}', [np.array(img) for i, img in enumerate(img) if i%2 == 0], fps=29)
Example #14
def load_training_env(env_id, env_path, log_dir, max_train_ep_length, seed):
    """Load a saved vectorized training env (used to continue training)."""
    env = gym.make(env_id)
    env.seed(seed)  # Set random seed
    env = TimeLimitWrapper(
        env, max_train_ep_length)  # Limit length of training episodes
    env = Monitor(env, log_dir)  # Monitor training
    env = NormalizeActionWrapper(env)  # Normalize action space
    env = DummyVecEnv([lambda: env])  # Vectorize environment
    env = VecNormalize.load(env_path, env)

    env.reset()

    return env
Example #15
def load_visualization_env(env_id, env_path, seed=0):
    """
    Create an environment using the saved statistics of the training vectorized
    env (used to visualize performance).
    """
    env = gym.make(env_id)
    env.seed(seed)
    env = Monitor(
        env
    )  # Used to ensure original action space is not modified by `NormalizeActionWrapper`
    env = NormalizeActionWrapper(env)
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(env_path, env)

    return env
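
Examples #14 and #15 wrap the environment in a `NormalizeActionWrapper` that is not shown here. A minimal sketch, assuming it simply exposes a [-1, 1] action space and rescales actions back to the original bounds:

import gym
import numpy as np

class NormalizeActionWrapper(gym.ActionWrapper):
    # Hypothetical wrapper: the agent acts in [-1, 1]; actions are rescaled to the env's true bounds
    def __init__(self, env):
        super().__init__(env)
        self.low, self.high = env.action_space.low, env.action_space.high
        self.action_space = gym.spaces.Box(
            low=-1.0, high=1.0, shape=env.action_space.shape, dtype=np.float32)

    def action(self, action):
        # map [-1, 1] -> [low, high]
        return self.low + 0.5 * (action + 1.0) * (self.high - self.low)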
Example #16
def test_vec_env(tmpdir):
    """Test VecNormalize Object"""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
Example #17
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    # Create a separate environment for rendering
    render_env = factory.make_env()

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])
    #
    if args.stats_path is None:
        envs = VecNormalize(envs)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs)
        learner.learn(total_timesteps=10000000, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.policy_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)
        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
Example #18
def test_current_exp(args):
    if args.save_img:
        all_folders = glob.glob(os.path.join(img_path,"*"))
        all_folders = [os.path.basename(x) for x in all_folders]
        all_folders = [int(x) if x.isnumeric() else -1 for x in all_folders] + [0]
        current_folder = max(all_folders) + 1
        current_folder = os.path.join(img_path, str(current_folder))
        os.makedirs(current_folder, exist_ok=True)
        print(f"Writing into {current_folder}")
        input("Press Enter...")

    env = DummyVecEnv([make_env(env_id=args.env_id, rank=0, seed=0, render=True)])
    env = VecNormalize.load(args.vnorm_filename, env)
    model = CustomizedPPO.load(args.model_filename, env=env)
    callback = AdjustCameraCallback()
    
    obs = env.reset()
    callback.reset_lights(env.envs[0].env._p) # once window is opened, change the lighting

    if args.save_img:
        time.sleep(1) # please use this time to maximize the window, so that the image recorded will be full size

    with model.policy.features_extractor.start_testing():
        while True:
            for i in range(1000):
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                callback.camera_simpy_follow_robot(target_env=env.envs[0])
                if args.save_img:
                    callback.write_a_image(current_folder=current_folder, step=i, target_env=env.envs[0])
                    if obs.shape[1]>100: # With Vision I guess
                        image = np.rollaxis(obs[:, -3*8*8:].reshape([3,8,8]), 0, start=3) * 255.0
                        print(image.shape)
                        # image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                        cv2.imwrite(f"{current_folder}/vision_{i:05}.png", image)
                if done:
                    break
                time.sleep(0.01)
            break
        time.sleep(0.1)
    env.close()
Example #19
    def _maybe_normalize(self, env: VecEnv, eval_env: bool) -> VecEnv:
        """
        Wrap the env into a VecNormalize wrapper if needed
        and load saved statistics when present.

        :param env:
        :param eval_env:
        :return:
        """
        # Pretrained model, load normalization
        path_ = os.path.join(os.path.dirname(self.trained_agent), self.env_id)
        path_ = os.path.join(path_, "vecnormalize.pkl")

        if os.path.exists(path_):
            print("Loading saved VecNormalize stats")
            env = VecNormalize.load(path_, env)
            # Deactivate training and reward normalization
            if eval_env:
                env.training = False
                env.norm_reward = False

        elif self.normalize:
            # Copy to avoid changing default values by reference
            local_normalize_kwargs = self.normalize_kwargs.copy()
            # Do not normalize reward for env used for evaluation
            if eval_env:
                if len(local_normalize_kwargs) > 0:
                    local_normalize_kwargs["norm_reward"] = False
                else:
                    local_normalize_kwargs = {"norm_reward": False}

            if self.verbose > 0:
                if len(local_normalize_kwargs) > 0:
                    print(f"Normalization activated: {local_normalize_kwargs}")
                else:
                    print("Normalizing input and reward")
            env.num_envs = self.n_envs
            env = VecNormalize(env, **local_normalize_kwargs)
        return env
Example #20
def setup(args):
    bridge = Bridge()

    render_key = "renders" if 'CartPole' in args.env else "render"
    env_kwargs = {
        render_key: args.render,
        "adv_force": args.adv_force,
        "mass_percentage": args.mass_percentage,
        "friction_percentage": args.friction_percentage,
        "simple_reward": args.simple_reward,
    }

    env = make_vec_env(args.env,
                       env_kwargs=env_kwargs,
                       seed=args.seed,
                       monitor_dir=args.monitor_dir)

    if args.evaluate:
        env = VecNormalize.load(f'{args.pickle}-{args.envname}', env)
        prot_agent = PPO.load(f'{args.pickle}-{args.prot_name}', device='cpu')
        if prot_agent.seed != args.seed:
            logging.info(
                f'warning: {prot_agent.seed=} does not match {args.seed=}')

        if args.adversarial:
            adv_agent = PPO.load(args.adv_pickle, device='cpu')
            if adv_agent.seed != args.seed:
                logging.info(
                    f'warning: {adv_agent.seed=} does not match {args.seed=}')
        else:
            adv_agent = None
    else:
        env = VecNormalize(env)
        prot_logname = f'{args.logs}-{args.prot_name}' if args.logs else None
        prot_agent = PPO("MlpPolicy",
                         env,
                         verbose=args.verbose,
                         seed=args.seed,
                         tensorboard_log=prot_logname,
                         n_steps=args.N_steps,
                         is_protagonist=True,
                         bridge=bridge,
                         device='cpu')

        if args.adversarial:
            adv_logname = f'{args.logs}-{args.adv_name}' if args.logs else None
            adv_agent = PPO("MlpPolicy",
                            env,
                            verbose=args.verbose,
                            seed=args.seed,
                            tensorboard_log=adv_logname,
                            n_steps=args.N_steps,
                            is_protagonist=False,
                            bridge=bridge,
                            device='cpu')
        else:
            adv_agent = None

    bridge.link_agents(prot_agent, adv_agent)

    return prot_agent, adv_agent, env
Example #21
def make_env(
    args,
    num_envs=None,
    include_norm=False,
    norm_reward=True,
    **kwargs,
):
    """Return a vectorized environment containing `num_envs` or `args.num_envs`
    environments (depending on whether `num_envs is None`).
    `args`, the command line arguments, specify several values. See `kwargs`
    for a more detailed explanation on their interaction.
    `include_norm` specifies whether the environment is wrapped in a
    normalizing environment.
    `norm_reward` indicates whether the rewards are normalized (only
    relevant if `include_norm is True`).
    `kwargs` are passed directly to the environment creation function. Any
    value given via `kwargs` has priority over the one given by `args`.
    """
    if num_envs is None:
        num_envs = args.num_envs

    # `kwargs` given via `args`
    args_kwargs = {}
    for arg in [
            'M',
            'dt',
            'restol',
            'lambda_real_interval',
            'lambda_imag_interval',
            'lambda_real_interpolation_interval',
            'norm_factor',
            'residual_weight',
            'step_penalty',
            'reward_iteration_only',
            'reward_strategy',
            'collect_states',
            'example',
    ]:
        args_kwargs[arg] = kwargs.pop(arg, getattr(args, arg))
    all_kwargs = {**kwargs, **args_kwargs}

    # SAC does not support float64
    if args.model_class == 'SAC':
        all_kwargs['use_doubles'] = False

    seed = all_kwargs.pop('seed', args.seed)

    def gym_make(i):
        return lambda: gym.make(
            args.envname,
            seed=seed + i if seed is not None else None,
            **all_kwargs,
        )

    env = DummyVecEnv([gym_make(i) for i in range(num_envs)])
    if include_norm:
        if hasattr(args, 'env_path') and args.env_path is not None:
            env = VecNormalize.load(str(Path(args.env_path)), env)
        else:
            # When training, set `norm_reward = True`, I hear...
            if 'gamma' in args.model_kwargs:
                env = VecNormalize(
                    env,
                    norm_obs=args.norm_obs,
                    norm_reward=norm_reward,
                    gamma=args.model_kwargs['gamma'],
                )
            else:
                env = VecNormalize(
                    env,
                    norm_obs=args.norm_obs,
                    norm_reward=norm_reward,
                )
    if debug_nans:
        env = VecCheckNan(env, raise_exception=True)
    return env
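
A hypothetical call illustrating the priority rule described in the docstring, assuming an `args` namespace like the one used above (all values here are made up): anything passed explicitly overrides the corresponding attribute of `args`.

# `dt` given here wins over `args.dt`; the remaining env settings fall back to `args`
env = make_env(args, num_envs=4, include_norm=True, norm_reward=False, dt=1e-3)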
Example #22
    save_model_folder = 'trained_models'
    save_model_filename = '2M_OSC_POSE'
    load_model_folder = 'trained_models'
    load_model_filename = '2M_OSC_POSE'

    save_model_path = os.path.join(save_model_folder, save_model_filename)
    save_vecnormalize_path = os.path.join(save_model_folder, 'vec_normalize_' + save_model_filename + '.pkl')
    load_model_path = os.path.join(load_model_folder, load_model_filename)
    load_vecnormalize_path = os.path.join(load_model_folder, 'vec_normalize_' + load_model_filename + '.pkl')

    if training:
        env = SubprocVecEnv([make_training_env(env_id, options, i) for i in range(num_cpu)])
        env = VecNormalize(env)

        if isinstance(load_model_for_training_path, str):
            env = VecNormalize.load(load_vecnormalize_for_training_path, env)
            model = PPO.load(load_model_for_training_path, env=env)
        else:
            model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log_folder)
        
        eval_env_func = make_training_env(env_id, options, rank=num_cpu)
        eval_env = DummyVecEnv([eval_env_func])
        eval_env = VecNormalize(eval_env)

        eval_callback = EvalCallback(eval_env, best_model_save_path='./best_models/',
                             log_path='./logs_best_model/',
                             deterministic=True, render=False, n_eval_episodes=10)

        model.learn(total_timesteps=training_timesteps, tb_log_name=tb_log_name, callback=eval_callback)

        model.save(save_model_path)
Example #23
def normalize_env(
    env,
    orig_log_dir,
    sb_version,
    vectorize=True,
    continue_learning=False,
    evaluate=False,
    evaluate_during_learning=False,
    normalize_kwargs=None,
):
    if vectorize:
        env = DummyVecEnv([lambda: env])

    logger.debug("Normalize: {}".format(normalize_kwargs))
    if evaluate:
        # FIXME in continue learning training should be True so that we update the running average of obs and
        #  rewards with new samples; if I do that, the algo performs very poorly even with no changes in the env
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        if not evaluate_during_learning or continue_learning:
            if not os.path.exists(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")):
                env_name = get_env_name(env=env.unwrapped,
                                        sb_version=sb_version)
                index_last_separator = orig_log_dir.rindex("/")
                new_orig_log_dir = os.path.join(
                    orig_log_dir[0:index_last_separator], "logs_" + env_name)
                logger.debug(
                    "{} does not exist. Trying to search it in the original model directory {}"
                    .format(os.path.join(orig_log_dir, "vecnormalize.pkl"),
                            new_orig_log_dir))
                assert os.path.exists(new_orig_log_dir), \
                    "{} does not exist".format(new_orig_log_dir)
                assert os.path.exists(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")), (
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl") +
                        " does not exist")
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(new_orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
                else:
                    env = VecNormalize.load(
                        os.path.join(new_orig_log_dir, "vecnormalize.pkl"),
                        env)
            else:
                logger.debug("[evaluate] Loading {}".format(
                    os.path.join(orig_log_dir, "vecnormalize.pkl")))
                if sb_version == "sb3":
                    env = VecNormalize3.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
                else:
                    env = VecNormalize.load(
                        os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

        # Deactivate training and reward normalization
        env.training = False
        env.norm_reward = False

    elif continue_learning:
        # FIXME: don't know why but during continue learning I have to disable training otherwise performance
        #  is not the same as in the model trained from scratch even without changing the params of the environment.
        #  in rl-baselines-zoo this is not done during continue learning:
        #  https://github.com/araffin/rl-baselines-zoo/blob/master/train.py#L365
        if sb_version == "sb3":
            env = VecNormalize3(env, training=False, **normalize_kwargs)
        else:
            env = VecNormalize(env, training=False, **normalize_kwargs)

        assert os.path.exists(os.path.join(
            orig_log_dir, "vecnormalize.pkl")), (
                os.path.join(orig_log_dir, "vecnormalize.pkl") +
                " does not exist")
        logger.debug("[continue_learning] Loading {}".format(
            os.path.join(orig_log_dir, "vecnormalize.pkl")))
        if sb_version == "sb3":
            env = VecNormalize3.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)
        else:
            env = VecNormalize.load(
                os.path.join(orig_log_dir, "vecnormalize.pkl"), env)

    else:
        if sb_version == "sb3":
            env = VecNormalize3(env, **normalize_kwargs)
        else:
            env = VecNormalize(env, **normalize_kwargs)

    return env
Example #24
        if 'policy_kwargs' in hyperparams.keys():
            del hyperparams['policy_kwargs']

        model = ALGOS[args.algo].load(args.trained_agent,
                                      env=env,
                                      seed=args.seed,
                                      tensorboard_log=tensorboard_log,
                                      verbose=args.verbose,
                                      **hyperparams)

        exp_folder = args.trained_agent.split('.zip')[0]
        if normalize:
            print("Loading saved running average")
            stats_path = os.path.join(exp_folder, env_id)
            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
            else:
                # Legacy:
                env.load_running_average(exp_folder)

        replay_buffer_path = os.path.join(os.path.dirname(args.trained_agent),
                                          'replay_buffer.pkl')
        if os.path.exists(replay_buffer_path):
            print("Loading replay buffer")
            model.load_replay_buffer(replay_buffer_path)

    elif args.optimize_hyperparameters:

        if args.verbose > 0:
            print("Optimizing hyperparameters")
Example #25
if custom_params['USING_VAE']:
    env = NormalizeWrapper(env)  # No need to use normalization if image
    env = FinalLayerObservationWrapper(env, latent_dim=1028, map="map3")

# Step 3.b. Make a vectorized environment so that Normalize or FrameStack can be used (optional)
env = make_vec_env(lambda: env, n_envs=1)
# Step 3.b. Apply normalization and frame stacking (optional)

env = VecFrameStack(
    env,
    n_stack=custom_params['FRAME_STACK'])  # Use 1 for now because we use image
if not custom_params['USING_VAE']:
    env = VecTransposeImage(env)  # Uncomment if using 3d obs
if custom_params['USING_NORMALIZATION']:
    env = VecNormalize.load(osp.join(results_dir, "vec_normalization.pkl"),
                            env)

# Load the agent
if custom_params['algo'] == 'sac':
    model = SAC.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'a2c':
    model = A2C.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'dqn':
    model = DQN.load(osp.join(results_dir, "best_model", "best_model.zip"))
elif custom_params['algo'] == 'ppo':
    model = PPO.load(osp.join(results_dir, "best_model", "best_model.zip"))

else:
    raise ValueError("Error model")

Example #26
# Load the saved statistics
def main(args):
    policy_path = args.policy_path
    expert = PPO.load(policy_path)

    # Initialize environment for input standardization
    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    env = VecNormalize.load(args.stats_path, env)
    env.training = False

    states = []
    for i in np.arange(-10, 110):
        for j in np.arange(-3, 3, 0.05):
            states.append([i, j])
    states = np.stack(states)
    states_scaled = env.normalize_obs(states)
    states_tensor = torch.as_tensor(states_scaled).float()

    policy: ActorCriticPolicy = expert.policy.cpu()
    true_actions_tensor, _, _ = policy.forward(states_tensor,
                                               deterministic=True)
    features_tensor = policy.features_extractor.forward(states_tensor)
    shared_latents_tensor = policy.mlp_extractor.shared_net.forward(
        features_tensor)
    policy_latents_tensor_layer1 = policy.mlp_extractor.policy_net[0].forward(
        shared_latents_tensor)
    policy_latents_tensor_layer1_activated = policy.mlp_extractor.policy_net[
        1].forward(policy_latents_tensor_layer1)
    policy_latents_tensor_layer2 = policy.mlp_extractor.policy_net[2].forward(
        policy_latents_tensor_layer1_activated)
    policy_latents_tensor_layer2_activated = policy.mlp_extractor.policy_net[
        3].forward(policy_latents_tensor_layer2)
    actions_tensor = policy.action_net.forward(
        policy_latents_tensor_layer2_activated)

    assert actions_tensor.equal(true_actions_tensor)

    binary_embeddings_layer1 = policy_latents_tensor_layer1_activated > 0
    binary_embeddings_layer1 = binary_embeddings_layer1.cpu().detach().numpy()
    binary_embeddings_layer2 = policy_latents_tensor_layer2_activated > 0
    binary_embeddings_layer2 = binary_embeddings_layer2.cpu().detach().numpy()

    binary_embeddings = np.concatenate(
        [binary_embeddings_layer1, binary_embeddings_layer2],
        axis=1).astype(int)
    integer_embeddings = np.packbits(binary_embeddings,
                                     axis=1,
                                     bitorder="little")
    integer_embeddings = integer_embeddings @ (256**np.arange(
        integer_embeddings.shape[1]))  # to allow arbitrary number of bits

    # convert raw integer embeddings to 0, 1, 2, 3...
    # fast rendering of state cells via grid interpolation
    grid_x, grid_y = np.mgrid[-10:110:1000j, -3:3:1000j]
    z = griddata((states[:, 0], states[:, 1]),
                 integer_embeddings, (grid_x, grid_y),
                 method='nearest')

    # convert raw integer
    convert_raw_integer_to_colorhash = np.vectorize(lambda x: ColorHash(x).rgb)
    grid_z = np.array(convert_raw_integer_to_colorhash(z)).swapaxes(
        0, 1).swapaxes(1, 2)

    plt.figure()
    plt.imshow(grid_z, extent=[-10, 110, -3, 3], aspect='auto')
    plt.title("State Space Visualized")
    plt.xlabel("$x$")
    plt.ylabel("$\\dot x$")
    plt.show()
Example #27
def create_test_env(env_id,
                    n_envs=1,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None,
                    env_kwargs=None):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)
    if 'env_wrapper' in hyperparams.keys():
        del hyperparams['env_wrapper']

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id:
        # HACK: force SubprocVecEnv for Bullet env
        env = SubprocVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     log_dir,
                     wrapper_class=env_wrapper,
                     env_kwargs=env_kwargs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #28
        else:

            # Join file paths
            continue_training_model_path = os.path.join(
                continue_training_model_folder,
                continue_training_model_filename)
            continue_training_vecnormalize_path = os.path.join(
                continue_training_model_folder,
                'vec_normalize_' + continue_training_model_filename + '.pkl')

            print(
                f"Continual training on model located at {continue_training_model_path}"
            )

            # Load normalized env
            env = VecNormalize.load(continue_training_vecnormalize_path, env)

            # Load model
            model = PPO.load(continue_training_model_path, env=env)

        # Training
        model.learn(total_timesteps=training_timesteps,
                    tb_log_name=tb_log_name,
                    callback=checkpoint_callback,
                    reset_num_timesteps=True)

        # Save trained model
        model.save(save_model_path)
        env.save(save_vecnormalize_path)

    else:
Example #29
def create_test_env(
    env_id: str,
    n_envs: int = 1,
    stats_path: Optional[str] = None,
    seed: int = 0,
    log_dir: Optional[str] = None,
    should_render: bool = True,
    hyperparams: Optional[Dict[str, Any]] = None,
    env_kwargs: Optional[Dict[str, Any]] = None,
) -> VecEnv:
    """
    Create environment for testing a trained agent

    :param env_id:
    :param n_envs: number of processes
    :param stats_path: path to folder containing saved running averages
    :param seed: Seed for random number generator
    :param log_dir: Where to log rewards
    :param should_render: For Pybullet env, display the GUI
    :param hyperparams: Additional hyperparams (ex: n_stack)
    :param env_kwargs: Optional keyword argument to pass to the env constructor
    :return:
    """
    # Avoid circular import
    from utils.exp_manager import ExperimentManager

    # Create the environment and wrap it if necessary
    env_wrapper = get_wrapper_class(hyperparams)

    hyperparams = {} if hyperparams is None else hyperparams

    if "env_wrapper" in hyperparams.keys():
        del hyperparams["env_wrapper"]

    vec_env_kwargs = {}
    vec_env_cls = DummyVecEnv
    if n_envs > 1 or (ExperimentManager.is_bullet(env_id) and should_render):
        # HACK: force SubprocVecEnv for Bullet env
        # as Pybullet envs do not follow the gym.render() interface
        vec_env_cls = SubprocVecEnv
        # start_method = 'spawn' for thread safe

    env = make_vec_env(
        env_id,
        n_envs=n_envs,
        monitor_dir=log_dir,
        seed=seed,
        wrapper_class=env_wrapper,
        env_kwargs=env_kwargs,
        vec_env_cls=vec_env_cls,
        vec_env_kwargs=vec_env_kwargs,
    )

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            print("Loading running average")
            print(f"with params: {hyperparams['normalize_kwargs']}")
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)
    return env
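
A hypothetical call to this version of `create_test_env`, assuming a zoo-style layout in which the saved `vecnormalize.pkl` sits directly under `stats_path` (paths and hyperparameters below are made up):

hyperparams = {"normalize": True, "normalize_kwargs": {"norm_reward": False}, "frame_stack": 0}
env = create_test_env(
    "HalfCheetahBulletEnv-v0",
    n_envs=1,
    stats_path="logs/ppo/HalfCheetahBulletEnv-v0_1/HalfCheetahBulletEnv-v0",
    seed=0,
    hyperparams=hyperparams,
)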
Example #30
def create_test_env(
    env_id, n_envs=1, stats_path=None, seed=0, log_dir="", should_render=True, hyperparams=None, env_kwargs=None
):
    """
    Create environment for testing a trained agent

    :param env_id: (str)
    :param n_envs: (int) number of processes
    :param stats_path: (str) path to folder containing saved running averages
    :param seed: (int) Seed for random number generator
    :param log_dir: (str) Where to log rewards
    :param should_render: (bool) For Pybullet env, display the GUI
    :param hyperparams: (dict) Additional hyperparams (ex: n_stack)
    :param env_kwargs: (Dict[str, Any]) Optional keyword argument to pass to the env constructor
    :return: (gym.Env)
    """
    # HACK to save logs
    # if log_dir is not None:
    #     os.environ["OPENAI_LOG_FORMAT"] = 'csv'
    #     os.environ["OPENAI_LOGDIR"] = os.path.abspath(log_dir)
    #     os.makedirs(log_dir, exist_ok=True)
    #     logger.configure()

    # Clean hyperparams so the dict can be passed to the model constructor
    keys_to_delete = ["n_envs", "n_timesteps", "env_wrapper", "callback", "frame_stack"]
    for key in keys_to_delete:
        delete_key(hyperparams, key)

    if n_envs > 1:
        # start_method = 'spawn' for thread safe
        env = SubprocVecEnv(
            [make_env(env_id, i, seed, log_dir, env_kwargs=env_kwargs) for i in range(n_envs)]
        )
    # Pybullet envs do not follow the gym.render() interface
    elif "Bullet" in env_id or "Walker2D" in env_id:
        # NOTE: DummyVecEnv is used here even for Bullet envs
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])
    else:
        env = DummyVecEnv([make_env(env_id, 127, seed, log_dir, env_kwargs=env_kwargs)])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams["normalize"]:
            # print("Loading running average")
            # print("with params: {}".format(hyperparams["normalize_kwargs"]))
            path_ = os.path.join(stats_path, "vecnormalize.pkl")
            if os.path.exists(path_):
                env = VecNormalize.load(path_, env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                raise ValueError(f"VecNormalize stats {path_} not found")

        n_stack = hyperparams.get("frame_stack", 0)
        if n_stack > 0:
            print(f"Stacking {n_stack} frames")
            env = VecFrameStack(env, n_stack)
    return env