Example #1
def test_monitor_load_results(tmp_path):
    """
    test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)

    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files

    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()

    results_size1 = len(load_results(os.path.join(tmp_path)).index)
    assert results_size1 == episode_count1

    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files

    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()

    results_size2 = len(load_results(os.path.join(tmp_path)).index)

    assert results_size2 == (results_size1 + episode_count2)

    os.remove(monitor_file1)
    os.remove(monitor_file2)
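As a side note to Example #1: load_results returns a pandas DataFrame with one row per episode and the columns 'r' (episode reward), 'l' (episode length) and 't' (elapsed time). A minimal sketch of how it is typically consumed (not part of the original test; the log directory is a placeholder):

from stable_baselines.bench.monitor import load_results

log_dir = "/tmp/monitor_logs"  # placeholder: any directory containing *.monitor.csv files
df = load_results(log_dir)
print(df.head())
# Smooth the per-episode reward over a 100-episode window.
print(df["r"].rolling(window=100).mean().tail())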
Example #2
def test_monitor():
    """
    test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

    file_handler = open(mon_file, 'rt')

    firstline = file_handler.readline()
    assert firstline.startswith('#')
    metadata = json.loads(firstline[1:])
    assert metadata['env_id'] == "CartPole-v1"
    assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"

    last_logline = pandas.read_csv(file_handler, index_col=None)
    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    file_handler.close()
    os.remove(mon_file)
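For reference, the file inspected above consists of a one-line JSON header (the metadata checked in the test) followed by a plain CSV body with the 'r', 'l', 't' columns. A run of this test produces a file shaped roughly like this (the numbers are illustrative only):

#{"t_start": 1614000000.0, "env_id": "CartPole-v1"}
r,l,t
9.0,9,0.015
10.0,10,0.021
11.0,11,0.027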
Example #3
def hardcode(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with hardcoded policy.")

    inc = 0
    done = False
    while inc < timesteps:
        obs = env.reset()
        while True:
            action = policy(obs)
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
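policy(obs) is not defined in this snippet. Purely as a hypothetical stand-in, assuming env_id were CartPole-v1, a hardcoded policy could look like this (an assumption, not the original author's code):

def policy(obs):
    # Hypothetical hardcoded CartPole rule: push the cart toward the side
    # the pole is leaning to (obs[2] is the pole angle).
    return 1 if obs[2] > 0 else 0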
Example #4
def main():
    """
    Runs the test
    """
    """
    Create an argparse.ArgumentParser for run_mujoco.py.

    :return:  (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}

    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    return parse
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)
    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    # Reload the previously saved weights; pass env so learn() can keep training.
    model = TRPO.load(model_path + "trpo.pkl", env=env)
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for _ in range(100):
        obs = env.reset()
        env.render()
        for _ in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
Example #5
def random_agent(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with random policy.")

    # initialize timestep counter
    inc = 0

    while inc < timesteps:
        obs = env.reset()
        while True:
            # choose a random action from action_space
            action = env.action_space.sample()
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
Example #6
# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####

if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step),
                log_interval=10,
                tb_log_name="log",
                callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

##### TEST #####

if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
        env.render()
        if done:
            obs = env.reset()
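scale_range is not defined in this snippet either; judging from the call site it rescales the SAC action from [-1, 1] to [0, 1]. A minimal hypothetical implementation under that assumption:

def scale_range(x, in_min, in_max, out_min, out_max):
    # Hypothetical helper: linearly map x from [in_min, in_max] to [out_min, out_max].
    return (x - in_min) * (out_max - out_min) / (in_max - in_min) + out_min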
Example #7
env = gym.make('UR5Gripper-v0')
# Create the vectorized environment
# env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
env = Monitor(env, log_dir, allow_early_resets=True)
# env = SubprocVecEnv([make_mujoco_env(env_id, i) for i in range(num_cpu)])
# env = SubprocVecEnv([lambda: env])
env = DummyVecEnv([lambda: env])

# env = SubprocVecEnv([lambda: gym.make('UR5Gripper-v0') for i in range(num_cpu)])

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
# model = DDPG(MlpPolicy, env, param_noise=param_noise, verbose=1, tensorboard_log=log_dir)
# model = PPO2(MlpPolicy, env, verbose=1)
# model = SAC(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

# Train the agent
model.learn(total_timesteps=int(1e7), callback=callback)

mean_reward_after_train = evaluate(model, num_steps=1000)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
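evaluate is not defined in this snippet. A sketch of what such a helper might look like for a vectorized environment, inferred only from how it is called above (the implementation details are assumptions):

import numpy as np

def evaluate(model, num_steps=1000):
    # Hypothetical helper: run the model on its own (vectorized) env for
    # num_steps and report the mean reward of the completed episodes.
    env = model.get_env()
    obs = env.reset()
    episode_rewards, current_reward = [], 0.0
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, rewards, dones, _ = env.step(action)
        current_reward += rewards[0]
        if dones[0]:
            episode_rewards.append(current_reward)
            current_reward = 0.0
    mean_reward = float(np.mean(episode_rewards)) if episode_rewards else current_reward
    print("Mean reward:", mean_reward, "over", len(episode_rewards), "episodes")
    return mean_reward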
Example #8
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)
    env = Monitor(env, log_dir, allow_early_resets=True)

    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy,
                 env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        #dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        #model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)
        returns = []

        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
Example #9
    def step(self, action):
        if self.render_gui and self.rank == self.render_rank:
            self.render()
        return Monitor.step(self, action)
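This step override clearly belongs to a Monitor subclass that adds per-worker rendering; the surrounding class is not shown. A hypothetical reconstruction, keeping the attribute names from the snippet and assuming everything else:

class RenderMonitor(Monitor):
    # Hypothetical Monitor subclass: render only in the worker whose rank
    # matches render_rank, then delegate episode logging to Monitor.step.
    def __init__(self, env, filename, rank=0, render_gui=False, render_rank=0, **kwargs):
        super(RenderMonitor, self).__init__(env, filename, **kwargs)
        self.rank = rank
        self.render_gui = render_gui
        self.render_rank = render_rank

    def step(self, action):
        if self.render_gui and self.rank == self.render_rank:
            self.render()
        return Monitor.step(self, action)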
Example #10
import gym
import os
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Create the log folder
log_dir = './logs/'
os.makedirs(log_dir, exist_ok=True)

# Create the environment
env = gym.make('CartPole-v1')
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2(MlpPolicy, env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Test the model
state = env.reset()
for i in range(200):
    env.render()
    action, _ = model.predict(state)
    state, rewards, done, info = env.step(action)
    if done:
        break
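The Monitor logs written to ./logs/ can afterwards be plotted with results_plotter, in the same way Example #13 below does; a minimal sketch for this CartPole run:

import matplotlib.pyplot as plt
from stable_baselines import results_plotter

# Plot episode reward against timesteps for the monitor files in ./logs/.
results_plotter.plot_results(['./logs/'], 10000, results_plotter.X_TIMESTEPS, "PPO2 CartPole")
plt.show()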
Example #11
import gym
import AI_test
from stable_baselines.common.env_checker import check_env
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv

env = gym.make('AI_test:Ai-v0')
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])
model = ACKTR('MlpPolicy', env, verbose=1).learn(10000)

# Test the trained agent
obs = env.reset()
n_steps = 100
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    env.render()
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
Example #12
    def step(self, action):
        if self.render_me:
            self.render()
        ret = Monitor.step(self, action)

        return ret
Example #13
def main():
    # Create the log directory
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)
    # Create the environments
    envRoll = gym.make('gym_foo:DroneRoll-v0')
    envRoll = Monitor(envRoll, log_dir)
    modelRoll = PPO2(MlpPolicy,
                     envRoll,
                     gamma=0.99,
                     n_steps=2048,
                     ent_coef=0.0,
                     learning_rate=3e-4,
                     lam=0.95,
                     nminibatches=32,
                     noptepochs=10,
                     cliprange=0.2,
                     verbose=1)
    envPitch = gym.make('gym_foo:DronePitch-v0')
    envPitch = Monitor(envPitch, log_dir)

    modelPitch = PPO2(MlpPolicy,
                      envPitch,
                      gamma=0.99,
                      n_steps=2048,
                      ent_coef=0.0,
                      learning_rate=3e-4,
                      lam=0.95,
                      nminibatches=32,
                      noptepochs=10,
                      cliprange=0.2,
                      verbose=1)
    envYaw = gym.make('gym_foo:DroneYaw-v0')
    envYaw = Monitor(envYaw, log_dir)

    modelYaw = PPO2(MlpPolicy,
                    envYaw,
                    gamma=0.99,
                    n_steps=2048,
                    ent_coef=0.0,
                    learning_rate=3e-4,
                    lam=0.95,
                    nminibatches=32,
                    noptepochs=10,
                    cliprange=0.2,
                    verbose=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    # Train the three models
    time_steps = 2e6
    modelRoll.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Roll")
    plt.show()

    modelPitch.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Pitch")
    plt.show()

    modelYaw.learn(total_timesteps=int(time_steps), callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Yaw")
    plt.show()

    # Save the models
    modelRoll.save("Drone_Roll_PPO_001")
    modelPitch.save("Drone_Pitch_PPO_001")
    modelYaw.save("Drone_Yaw_PPO_001")

    # Load a model
    #model = PPO2.load("Drone_Roll_PPO_0.01")

    # Test: generate the time response
    T = [0]
    # Test loop
    t = 0
    #obs = env.reset()
    obsRoll = envRoll.reset()
    obsPitch = envPitch.reset()
    obsYaw = envYaw.reset()
    Roll = [envRoll.state[0]]
    Pitch = [envPitch.state[0]]
    Yaw = [envYaw.state[0]]

    # Simulation loop
    while t < 10:  # up to 10 seconds

        actionRoll, _states = modelRoll.predict(obsRoll)
        # Retrieve new state, reward, and whether the state is terminal
        obsRoll, reward, done, info = envRoll.step(actionRoll)
        Roll.append((180 / np.pi) * envRoll.state[0])

        actionPitch, _states = modelPitch.predict(obsPitch)
        # Retrieve new state, reward, and whether the state is terminal
        obsPitch, reward, done, info = envPitch.step(actionPitch)
        Pitch.append((180 / np.pi) * envPitch.state[0])

        actionYaw, _states = modelYaw.predict(obsYaw)
        # Retrieve new state, reward, and whether the state is terminal
        obsYaw, reward, done, info = envYaw.step(actionYaw)
        Yaw.append((180 / np.pi) * envYaw.state[0])

        t += 0.01
        T.append(t)

    #Plots
    plt.figure(1)
    plt.plot(T, Roll)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Roll')
    plt.xlabel('Time (seconds)')
    plt.title('Roll Response')
    plt.grid()
    plt.show()

    plt.figure(2)
    plt.plot(T, Pitch)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Pitch')
    plt.xlabel('Time (seconds)')
    plt.title('Pitch Response')
    plt.grid()
    plt.show()

    plt.figure(3)
    plt.plot(T, Yaw)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Yaw')
    plt.xlabel('Time (seconds)')
    plt.title('Yaw Response')
    plt.grid()
    plt.show()