Example #1
def test_custom_vec_env(tmp_path):
    """
    Standalone test for a special case (passing a custom VecEnv class) to avoid doubling the number of tests.
    """
    monitor_dir = tmp_path / "test_make_vec_env/"
    env = make_vec_env(
        "CartPole-v1",
        n_envs=1,
        monitor_dir=monitor_dir,
        seed=0,
        vec_env_cls=SubprocVecEnv,
        vec_env_kwargs={"start_method": None},
    )

    assert env.num_envs == 1
    assert isinstance(env, SubprocVecEnv)
    assert os.path.isdir(monitor_dir)
    # Kill subprocess
    env.close()
    # Cleanup folder
    shutil.rmtree(monitor_dir)

    # This should fail because DummyVecEnv does not take any keyword arguments
    with pytest.raises(TypeError):
        make_vec_env("CartPole-v1", n_envs=1, vec_env_kwargs={"dummy": False})
Example #2
def main():
    base_args, base_parser = get_logger2_args()
    args = get_args(base_parser)
    args.device = init_gpus_and_randomness(args.seed, args.gpu)
    logger = Logger2('/tmp/tmp', use_tensorboardX=True)
    logger.log_tb_object(args, 'args')
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.visualize:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.visualize else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.visualize:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Example #3
def main(args):
    envs = make_vec_env(args.env_name,
                        n_envs=args.num_envs,
                        vec_env_cls=SubprocVecEnv)
    viz_env = None
    if args.viz:
        nm_core, nm_vrsn = args.env_name.split('-')
        nm_core += 'Viz' if args.viz else 'Dbg' if args.debug else ''
        viz_env = make_vec_env(nm_core + '-' + nm_vrsn, n_envs=1)
    rl_learner = PPO('MlpPolicy',
                     envs,
                     verbose=1,
                     seed=args.seed,
                     device='cpu')
    for epoch in range(args.num_epochs):
        rl_learner.learn(args.steps_per_epoch)
        if args.viz:
            obs = viz_env.reset()
            done = False
            while not done:
                act, _ = rl_learner.predict(obs)
                if len(act.shape) > len(viz_env.action_space.shape):
                    act = act[0:1]  # just one viz env
                obs, rwd, done, _ = viz_env.step(act)
                time.sleep(0.01)  # to make motions visible
Example #4
    def __init__(self, args):
        self.envName = args.envName
        self.numEnvs = args.numEnvs
        self.algo = args.algo
        self.timeSteps = args.timeSteps
        self.saveDir = args.saveDir

        # if self.algo in ['PPO','A2C']:
        self.env = make_vec_env(self.envName, n_envs=self.numEnvs)
Example #5
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    # do not update the normalization statistics at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    
    # env = gym.make(args.env)
    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action, _ = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))

    if args.timesteps is None:
        while not done: 
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps): 
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()

    if args.gif:
        imageio.mimsave(
            os.path.join("logs", args.env, args.agent, "recording.gif"),
            [np.array(frame) for i, frame in enumerate(img) if i % 2 == 0],
            fps=29,
        )
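For context, a hedged sketch of the training-side counterpart that would produce the vec_normalize.pkl loaded above. The env id and paths are illustrative, not taken from this script, and the import assumes the same SB3 version as the other snippets here (make_vec_env in common.cmd_util; newer releases moved it to common.env_util).

import os

from stable_baselines3 import PPO
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

train_env = VecNormalize(make_vec_env("Pendulum-v0", n_envs=4),
                         norm_obs=True, norm_reward=True)
model = PPO("MlpPolicy", train_env, verbose=0)
model.learn(total_timesteps=10000)
model.save(os.path.join("logs", "Pendulum-v0", "ppo", "best_model"))
# saving the normalization statistics is what makes VecNormalize.load possible at test time
train_env.save(os.path.join("logs", "Pendulum-v0", "ppo", "vec_normalize.pkl"))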
Example #6
def test_make_vec_env(env_id, n_envs, vec_env_cls, wrapper_class):
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls, wrapper_class=wrapper_class, monitor_dir=None, seed=0)

    assert env.num_envs == n_envs

    if vec_env_cls is None:
        assert isinstance(env, DummyVecEnv)
        if wrapper_class is not None:
            assert isinstance(env.envs[0], wrapper_class)
        else:
            assert isinstance(env.envs[0], Monitor)
    else:
        assert isinstance(env, SubprocVecEnv)
    # Kill subprocesses
    env.close()
Example #7
    def __init__(self,
                 rap,
                 log_dir="/tmp/gym",
                 training_config=None,
                 algorithm="A2C",
                 checkpoint_results=None):
        super(ResourceManager,
              self).__init__(rap,
                             log_dir=log_dir,
                             algorithm=algorithm,
                             checkpoint_results=checkpoint_results)

        self.model_name = rap["name"] + "_baseline"

        self.environment = ResourceAllocationEnvironment(self.ra_problem)
        check_env(self.environment, warn=True)

        self.vector_environment = make_vec_env(lambda: self.environment,
                                               n_envs=1,
                                               monitor_dir=self.log_dir)

        self.training_steps = training_config["stage1_training_steps"]
Example #8
        def train(config, checkpoint_dir=None):

            if model_keys:
                for key, path in zip(model_keys, model_paths):
                    submodel = PPO.load(path)
                    environment_kwargs["lower_lvl_models"][key] = submodel
            if "stage1_models" in policy_kwargs:
                for path in model_paths:
                    stage1_model = PPO.load(path)
                    policy_kwargs["stage1_models"].append(stage1_model)

            environment = environment_class(ra_problem, **environment_kwargs)
            vector_environment = make_vec_env(lambda: environment,
                                              n_envs=1,
                                              monitor_dir=log_dir)

            rewards = []

            for n in range(hpsearch_iterations):
                model = PPO(policy,
                            vector_environment,
                            verbose=1,
                            tensorboard_log=log_dir,
                            learning_rate=config["learning_rate"],
                            ent_coef=config["ent_coef"],
                            max_grad_norm=config["max_grad_norm"],
                            policy_kwargs=policy_kwargs)

                model.learn(total_timesteps=training_steps)

                reward = evaluate_policy(model,
                                         vector_environment,
                                         n_eval_episodes=100)[0]
                rewards.append(reward)

            tune.report(reward=np.mean(rewards), std=np.std(rewards))
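A cautionary sketch on the lambda pattern above (the n_envs value is illustrative): the closure returns the same pre-built environment object every time it is called, which is harmless for n_envs=1, but with the default DummyVecEnv and n_envs > 1 every copy would be the same instance. Building the environment inside the factory avoids that:

vector_environment = make_vec_env(
    lambda: environment_class(ra_problem, **environment_kwargs),  # fresh instance per sub-env
    n_envs=4,
    monitor_dir=log_dir,
)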
Example #9
def test_callbacks(tmp_path, model_class):
    log_folder = tmp_path / "logs/callbacks/"

    # DQN only supports discrete actions
    env_name = select_env(model_class)
    # Create RL model
    # Small network for fast test
    model = model_class("MlpPolicy", env_name, policy_kwargs=dict(net_arch=[32]))

    checkpoint_callback = CheckpointCallback(save_freq=1000, save_path=log_folder)

    eval_env = gym.make(env_name)
    # Stop training if the performance is good enough
    callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=-1200, verbose=1)

    eval_callback = EvalCallback(
        eval_env, callback_on_new_best=callback_on_best, best_model_save_path=log_folder, log_path=log_folder, eval_freq=100
    )
    # Equivalent to the `checkpoint_callback`
    # but here in an event-driven manner
    checkpoint_on_event = CheckpointCallback(save_freq=1, save_path=log_folder, name_prefix="event")

    event_callback = EveryNTimesteps(n_steps=500, callback=checkpoint_on_event)

    # Stop training if max number of episodes is reached
    callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=100, verbose=1)

    callback = CallbackList([checkpoint_callback, eval_callback, event_callback, callback_max_episodes])
    model.learn(500, callback=callback)

    # Check access to local variables
    assert model.env.observation_space.contains(callback.locals["new_obs"][0])
    # Check that the child callback was called
    assert checkpoint_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert event_callback.locals["new_obs"] is callback.locals["new_obs"]
    assert checkpoint_on_event.locals["new_obs"] is callback.locals["new_obs"]
    # Check that the internal callback counters match the model's counters
    assert event_callback.num_timesteps == model.num_timesteps
    assert event_callback.n_calls == model.num_timesteps

    model.learn(500, callback=None)
    # Transform callback into a callback list automatically
    model.learn(500, callback=[checkpoint_callback, eval_callback])
    # Automatic wrapping, old way of doing callbacks
    model.learn(500, callback=lambda _locals, _globals: True)

    # Testing models that support multiple envs
    if model_class in [A2C, PPO]:
        max_episodes = 1
        n_envs = 2
        # Pendulum-v0 has a timelimit of 200 timesteps
        max_episode_length = 200
        envs = make_vec_env(env_name, n_envs=n_envs, seed=0)

        model = model_class("MlpPolicy", envs, policy_kwargs=dict(net_arch=[32]))

        callback_max_episodes = StopTrainingOnMaxEpisodes(max_episodes=max_episodes, verbose=1)
        callback = CallbackList([callback_max_episodes])
        model.learn(1000, callback=callback)

        # Check that the actual number of episodes and timesteps per env matches the expected one
        episodes_per_env = callback_max_episodes.n_episodes // n_envs
        assert episodes_per_env == max_episodes
        timesteps_per_env = model.num_timesteps // n_envs
        assert timesteps_per_env == max_episode_length

    if os.path.exists(log_folder):
        shutil.rmtree(log_folder)
Example #10
        print(
            "[ERROR] The selected algorithm does not support multiple environments"
        )
        exit()

    #### Uncomment to debug slurm scripts ######################
    # exit()

    env_name = ARGS.env + "-aviary-v0"
    sa_env_kwargs = dict(aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS,
                         obs=ARGS.obs,
                         act=ARGS.act)
    # train_env = gym.make(env_name, aggregate_phy_steps=shared_constants.AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act) # single environment instead of a vectorized one
    if env_name == "takeoff-aviary-v0":
        train_env = make_vec_env(TakeoffAviary,
                                 env_kwargs=sa_env_kwargs,
                                 n_envs=ARGS.cpu,
                                 seed=0)
    if env_name == "hover-aviary-v0":
        train_env = make_vec_env(HoverAviary,
                                 env_kwargs=sa_env_kwargs,
                                 n_envs=ARGS.cpu,
                                 seed=0)
    if env_name == "flythrugate-aviary-v0":
        train_env = make_vec_env(FlyThruGateAviary,
                                 env_kwargs=sa_env_kwargs,
                                 n_envs=ARGS.cpu,
                                 seed=0)
    print("[INFO] Action space:", train_env.action_space)
    print("[INFO] Observation space:", train_env.observation_space)
    # check_env(train_env, warn=True, skip_render_check=True)
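A compact alternative to the chain of if statements above, sketched with the same class names rather than taken from the original script: map environment ids to Aviary classes and fail loudly on an unsupported one.

AVIARY_CLASSES = {
    "takeoff-aviary-v0": TakeoffAviary,
    "hover-aviary-v0": HoverAviary,
    "flythrugate-aviary-v0": FlyThruGateAviary,
}
train_env = make_vec_env(AVIARY_CLASSES[env_name],  # KeyError for an unsupported env name
                         env_kwargs=sa_env_kwargs,
                         n_envs=ARGS.cpu,
                         seed=0)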
Example #11
def run(train_freq,
        gradient_steps,
        batch_size,
        envname,
        n_envs,
        log_interval,
        learning_rate,
        buffer_size,
        tau,
        gamma,
        target_policy_noise,
        target_noise_clip,
        learning_starts,
        total_timesteps,
        policy_kwargs,
        action_noise_mean,
        action_noise_sigma,
        noise_type,
        eval_freq,
        n_eval_episodes,
        verbose=True,
        tensorboard_log="logs/"):

    # Adjust eval frequency and buffer size for the number of parallel environments
    eval_freq = max(eval_freq // n_envs, 1)
    buffer_size = max(buffer_size // n_envs, 1)

    all_args = locals()

    path = "/" + os.path.join(*sb3.__file__.split("/")[:-2])
    commit_num = subprocess.check_output(["git", "describe", "--always"],
                                         cwd=path).strip().decode()

    env = gym.make(envname)
    vecenv = make_vec_env(envname, vec_env_cls=SubprocVecEnv, n_envs=n_envs)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    if noise_type == "OU":
        base_noise_class = OrnsteinUhlenbeckActionNoise
    elif noise_type == "Normal":
        base_noise_class = NormalActionNoise
    base_noise = base_noise_class(mean=np.ones(n_actions) * action_noise_mean,
                                  sigma=action_noise_sigma *
                                  np.ones(n_actions))
    action_noise = VectorizedActionNoise(base_noise, vecenv.num_envs)

    # Callbacks
    loggercallback = LoggerCallback("json", [("arguments", all_args),
                                             ("git", commit_num)])
    evalcallback = EvalCallback(make_vec_env(envname,
                                             vec_env_cls=SubprocVecEnv),
                                n_eval_episodes=n_eval_episodes,
                                eval_freq=eval_freq)

    # Initiate the model and start learning
    model = TD3("MlpPolicy",
                vecenv,
                action_noise=action_noise,
                batch_size=batch_size,
                train_freq=train_freq,
                gradient_steps=gradient_steps,
                learning_starts=learning_starts,
                n_episodes_rollout=-1,
                learning_rate=learning_rate,
                buffer_size=buffer_size,
                tau=tau,
                gamma=gamma,
                create_eval_env=True,
                target_policy_noise=target_policy_noise,
                target_noise_clip=target_noise_clip,
                verbose=verbose,
                policy_kwargs=policy_kwargs,
                tensorboard_log=tensorboard_log,
                device="cuda")
    model.learn(
        total_timesteps=total_timesteps,
        log_interval=log_interval,
        callback=[loggercallback, evalcallback],
        tb_log_name=envname,
    )
    model.env.close()
    evalcallback.eval_env.close()

    return evalcallback.best_mean_reward
Example #12
    if ARGS.algo in ['sac', 'td3', 'ddpg'] and ARGS.cpu != 1:
        print(
            "[ERROR] The selected algorithm does not support multiple environments"
        )
        exit()

    #### Uncomment to debug slurm scripts ##############################################################
    # exit()

    env_name = ARGS.env + "-aviary-v0"
    # train_env = gym.make(env_name, aggregate_phy_steps=AGGR_PHY_STEPS, obs=ARGS.obs, act=ARGS.act) # single environment instead of a vectorized one
    if env_name == "takeoff-aviary-v0":
        train_env = make_vec_env(TakeoffAviary,
                                 env_kwargs=dict(
                                     aggregate_phy_steps=AGGR_PHY_STEPS,
                                     obs=ARGS.obs,
                                     act=ARGS.act),
                                 n_envs=ARGS.cpu,
                                 seed=0)
    if env_name == "hover-aviary-v0":
        train_env = make_vec_env(HoverAviary,
                                 env_kwargs=dict(
                                     aggregate_phy_steps=AGGR_PHY_STEPS,
                                     obs=ARGS.obs,
                                     act=ARGS.act),
                                 n_envs=ARGS.cpu,
                                 seed=0)
    if env_name == "flythrugate-aviary-v0":
        train_env = make_vec_env(FlyThruGateAviary,
                                 env_kwargs=dict(
                                     aggregate_phy_steps=AGGR_PHY_STEPS,
                                     obs=ARGS.obs,
                                     act=ARGS.act),
                                 n_envs=ARGS.cpu,
                                 seed=0)
Example #13
def test_vec_env_kwargs():
    env = make_vec_env("MountainCarContinuous-v0",
                       n_envs=1,
                       seed=0,
                       env_kwargs={"goal_velocity": 0.11})
    assert env.get_attr("goal_velocity")[0] == 0.11
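For comparison, a minimal sketch (not part of the test): env_kwargs is passed through to the underlying gym.make call, so the vectorized environment above is roughly equivalent to wrapping this single one.

import gym

single_env = gym.make("MountainCarContinuous-v0", goal_velocity=0.11)
assert single_env.unwrapped.goal_velocity == 0.11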
Example #14
# LICENSE file in the root directory of this source tree.
# Change stable_baselines to stable_baselines3 if you are using the newer version
from ABC_Env_CSC2547 import ABCEnv
import gym
import numpy as np


from stable_baselines3 import DQN, PPO, A2C
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

# Instantiate the env
env = ABCEnv()
# wrap it
env = make_vec_env(lambda: env, n_envs=1)

# Train the agent
"""
Things you might want to play around with: learning_rate, total_timesteps, etc.
Always choose a sample-efficient algorithm.
"""
total_timesteps = 200
model = DQN('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log="./CSC2547_tensorboard/")
model.learn(total_timesteps)

model_name = "DQN_timesteps_" + str(total_timesteps)
model.save(model_name)
Example #15
def make_env(env_id, n_envs, vec_env_cls=SubprocVecEnv):
    env = make_vec_env(env_id, n_envs, vec_env_cls=vec_env_cls)
    env = VecNormalize(env, norm_reward=True)
    return env
Example #16
import os

import pybullet_envs
from pybullet_envs.stable_baselines.utils import TimeFeatureWrapper
from torch import nn

from stable_baselines3 import PPO
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

env = make_vec_env("HopperBulletEnv-v0",
                   n_envs=8,
                   wrapper_class=TimeFeatureWrapper)

env = VecNormalize(
    env,
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
)

model = PPO('MlpPolicy',
            env,
            verbose=1,
            tensorboard_log="hopper_ppo",
            batch_size=128,
            n_steps=512,
            gamma=0.99,
            gae_lambda=0.92,
            alpha=99 * 2.,
            beta=1 * 2.,
Example #17
        self.pbar = tqdm(total=self.total_timesteps)

        return ProgressBarCallback(self.pbar)

    def __exit__(self, exc_type, exc_val, exc_tb):  # close the callback
        self.pbar.n = self.total_timesteps
        self.pbar.update(0)
        self.pbar.close()


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment
env = make_vec_env('foo-v0', n_envs=1, monitor_dir=log_dir)
tic = time.perf_counter()

# Create callbacks
save_callback = SaveOnBestTrainingRewardCallback(check_freq=10000, log_dir=log_dir)


model = stable_baselines3.DQN('MlpPolicy', env, verbose=0, learning_rate=1e-4)
# DQN.load returns a new model; the freshly constructed one (and its hyperparameters) is discarded
model = stable_baselines3.DQN.load('AAA', env)


steps = int(10e6)
with ProgressBarManager(steps) as progress_callback:
    # This is equivalent to callback=CallbackList([progress_callback, auto_save_callback])
    model = model.learn(steps, callback=[progress_callback, save_callback])
model.save('AAA')
Example #18
            raise ValueError(f'Unrecognized action {action}')

        self._state = np.clip(self._state, 0, self._grid_size - 1)
        done = bool(self._state == self._grid_size - 1)
        reward = 1 if done else 0
        return np.array([self._state]).astype(np.float32), reward, done, {}

    def reset(self):
        self._state = 0
        return np.array([self._state]).astype(np.float32)

    def render(self, mode='human'):
        pass


if __name__ == '__main__':
    check_env(GridWorld(10))
    env = make_vec_env(lambda: GridWorld(10), n_envs=1)

    model = PPO('MlpPolicy', env, verbose=1).learn(5000)

    state = env.reset()
    for _ in range(20):
        action, _ = model.predict(state, deterministic=True)
        # action = 0
        next_state, reward, done, info = env.step(action)
        print(f'{state} -> {action} -> {next_state}: {reward}')
        state = next_state
        if done:
            break
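A small follow-up sketch reusing env and model from the example above: a stable-baselines3 VecEnv auto-resets a sub-environment as soon as it reports done, so the observation returned alongside done=True already belongs to the next episode, and the true terminal observation is kept in the info dict.

state = env.reset()
for _ in range(100):
    action, _ = model.predict(state, deterministic=True)
    state, reward, done, info = env.step(action)
    if done[0]:
        # the VecEnv has already reset; the finished episode's last obs lives in info
        terminal_obs = info[0].get("terminal_observation")
        print(f'terminal observation: {terminal_obs}')
        break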
Example #19
from stable_baselines3 import A2C
from stable_baselines3.a2c import MlpPolicy
from stable_baselines3.common.cmd_util import make_vec_env
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.policies import ActorCriticCnnPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import numpy as np
from model import BehaviorCloneNet, CarModel
from logloader import LogLoader
import time
from torchvision.transforms import Compose, ToTensor, Normalize
from custom_arch import CustomCNN, CustomActorCriticPolicy

env = make_vec_env(DeepwatchEnv2)

policy_kwargs = dict(features_extractor_class=CustomCNN)
#check_env(env)

n_actions = env.action_space.shape[-1]

action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

#model = TD3(CnnPolicy, env, action_noise=action_noise, buffer_size=50000, verbose=1) # optimize_memory_usage=True
#model = SAC(CnnPolicy, env, buffer_size=50000, action_noise=action_noise, learning_rate=0.0005, tensorboard_log='./tensorboard', verbose=1)
#model = SAC.load("deepwatch_evolution_sac_7", env)
model = A2C(MlpPolicy, env, verbose=1,
            n_steps=5)  #, policy_kwargs=policy_kwargs)
model = A2C.load("deepwatch_evolution_a2c_2", env)  # load returns a new model, so reassign it
Example #20
# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)
# -----------------------------------------------------------
# environments setup
#env = gym.make('Pendulum-v0')
#env = Monitor(env, log_dir)

# env_string = 'LunarLander-v2'
# env = make_vec_env(env_string, n_envs=1,  monitor_dir=log_dir)  # Parallel environments
# eval_env = gym.make(env_string)

env_string = 'ransim-v0'
env = make_vec_env(env_string,
                   env_kwargs={"t_final": 5000},
                   n_envs=1,
                   monitor_dir=log_dir)  # Parallel environments
eval_env = gym.make(env_string, t_final=5000)
# -------------------------------------------------------------------------
# Use deterministic actions for evaluation
eval_callback = EvalCallback(eval_env,
                             best_model_save_path='./logs/',
                             log_path='./logs/',
                             eval_freq=200,
                             deterministic=True,
                             render=False)

ransim_callback = CustomRansimCallback(eval_env,
                                       best_model_save_path='./logs/',
                                       log_path='./logs/',
                                       eval_freq=10 * 5 * 1e2,