Example #1
def test_monitor():
    """
    test the monitor wrapper
    """
    env = gym.make("CartPole-v1")
    env.seed(0)
    mon_file = "/tmp/stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())
    menv = Monitor(env, mon_file)
    menv.reset()
    for _ in range(1000):
        _, _, done, _ = menv.step(0)
        if done:
            menv.reset()

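    # The monitor file begins with a '#'-prefixed JSON metadata line, followed by one CSV row per
    # finished episode with its reward (r), length (l) and elapsed time (t), as asserted below.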
    file_handler = open(mon_file, 'rt')

    firstline = file_handler.readline()
    assert firstline.startswith('#')
    metadata = json.loads(firstline[1:])
    assert metadata['env_id'] == "CartPole-v1"
    assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"

    last_logline = pandas.read_csv(file_handler, index_col=None)
    assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
    file_handler.close()
    os.remove(mon_file)
Example #2
def test_monitor_load_results(tmp_path):
    """
    test load_results on log files produced by the monitor wrapper
    """
    tmp_path = str(tmp_path)
    env1 = gym.make("CartPole-v1")
    env1.seed(0)
    monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env1 = Monitor(env1, monitor_file1)

    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 1
    assert monitor_file1 in monitor_files

    monitor_env1.reset()
    episode_count1 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample())
        if done:
            episode_count1 += 1
            monitor_env1.reset()

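    # load_results aggregates every *.monitor.csv file in the directory into one DataFrame,
    # so its length should equal the number of completed episodes.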
    results_size1 = len(load_results(tmp_path).index)
    assert results_size1 == episode_count1

    env2 = gym.make("CartPole-v1")
    env2.seed(0)
    monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4()))
    monitor_env2 = Monitor(env2, monitor_file2)
    monitor_files = get_monitor_files(tmp_path)
    assert len(monitor_files) == 2
    assert monitor_file1 in monitor_files
    assert monitor_file2 in monitor_files

    monitor_env2.reset()
    episode_count2 = 0
    for _ in range(1000):
        _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample())
        if done:
            episode_count2 += 1
            monitor_env2.reset()

    results_size2 = len(load_results(tmp_path).index)

    assert results_size2 == (results_size1 + episode_count2)

    os.remove(monitor_file1)
    os.remove(monitor_file2)
Example #3
def main():
    """
    Runs the test
    """
    """
    Create an argparse.ArgumentParser for run_mujoco.py.

    :return:  (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}

    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    return parser
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)
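    # Build the Monitor-wrapped environment and resume training a previously saved TRPO model.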
    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    model = TRPO.load(model_path + "trpo.pkl", env=env)  # attach env so learn() can run on the loaded model
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for _ in range(100):
        obs = env.reset()
        env.render()
        for _ in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
Example #4
def hardcode(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with hardcoded policy.")

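    # Run complete episodes with the hardcoded policy() until the timestep budget is spent;
    # the Monitor wrapper records each finished episode to its log file.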
    inc = 0
    done = False
    while inc < timesteps:
        obs = env.reset()
        while True:
            action = policy(obs)
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
Example #5
def random_agent(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with random policy.")

    # initalize timestep counter
    inc = 0

    while inc < timesteps:
        obs = env.reset()
        while True:
            # choose a random action from action_space
            action = env.action_space.sample()
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
Example #6
# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####

if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step),
                log_interval=10,
                tb_log_name="log",
                callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

##### TEST #####

if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
        env.render()
        if done:
            obs = env.reset()
Example #7
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)
    env = Monitor(env, log_dir, allow_early_resets=True)

    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
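    # net_module is imported dynamically, so args.policy must name a module that exposes a Policy class.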
    model = PPO2(net_module.Policy,
                 env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        #dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        #model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)
        returns = []

        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
Example #8
import gym
import os
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

# Create the log folder
log_dir = './logs/'
os.makedirs(log_dir, exist_ok=True)

# Create the environment
env = gym.make('CartPole-v1')
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2(MlpPolicy, env, verbose=1)

# Train the model
model.learn(total_timesteps=10000)

# Test the model
state = env.reset()
for i in range(200):
    env.render()
    action, _ = model.predict(state)
    state, rewards, done, info = env.step(action)
    if done:
        break
Example #9
os.makedirs(log_dir, exist_ok=True)
b_program_settings["n_blue_cars"] = e["n"]
env = gym_env_generator(episode_timeout=30)
env = Monitor(env, log_dir)
policy_kwargs = dict(layers=e["layers"])
model = DQN("MlpPolicy",
            env,
            verbose=1,
            exploration_fraction=0.9,
            exploration_final_eps=0,
            learning_rate=0.001,
            learning_starts=100,
            policy_kwargs=policy_kwargs,
            double_q=e["double_q"],
            prioritized_replay=e["prioritized_replay"])

env = gym_env_generator(episode_timeout=100)
observation = env.reset()
print(observation)
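# The lines below mirror the preprocessing that model.predict() normally performs before
# querying the policy: check/reshape the observation into a batch and call the step model directly.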
observation = np.array(observation)
vectorized_env = model._is_vectorized_observation(observation,
                                                  model.observation_space)
observation = observation.reshape((-1, ) + model.observation_space.shape)
with model.sess.as_default():
    actions, a, b = model.step_model.step(observation, deterministic=True)
print(actions)
print(a)
print(b)
if not vectorized_env:
    actions = actions[0]
print(actions)
Example #10
def make_helper():
    env = gym.make(env_str)
    env.seed(cpu)
    env = Monitor(env, "models/", allow_early_resets=True)
    env.reset()
    return env
Example #11
def main():
    # create the log directory
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)
    # create the environments
    envRoll = gym.make('gym_foo:DroneRoll-v0')
    envRoll = Monitor(envRoll, log_dir)
    modelRoll = PPO2(MlpPolicy,
                     envRoll,
                     gamma=0.99,
                     n_steps=2048,
                     ent_coef=0.0,
                     learning_rate=3e-4,
                     lam=0.95,
                     nminibatches=32,
                     noptepochs=10,
                     cliprange=0.2,
                     verbose=1)
    envPitch = gym.make('gym_foo:DronePitch-v0')
    envPitch = Monitor(envPitch, log_dir)

    modelPitch = PPO2(MlpPolicy,
                      envPitch,
                      gamma=0.99,
                      n_steps=2048,
                      ent_coef=0.0,
                      learning_rate=3e-4,
                      lam=0.95,
                      nminibatches=32,
                      noptepochs=10,
                      cliprange=0.2,
                      verbose=1)
    envYaw = gym.make('gym_foo:DroneYaw-v0')
    envYaw = Monitor(envYaw, log_dir)

    modelYaw = PPO2(MlpPolicy,
                    envYaw,
                    gamma=0.99,
                    n_steps=2048,
                    ent_coef=0.0,
                    learning_rate=3e-4,
                    lam=0.95,
                    nminibatches=32,
                    noptepochs=10,
                    cliprange=0.2,
                    verbose=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    # training
    time_steps = int(2e6)
    modelRoll.learn(total_timesteps=time_steps, callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Roll")
    plt.show()

    modelPitch.learn(total_timesteps=time_steps, callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Pitch")
    plt.show()

    modelYaw.learn(total_timesteps=time_steps, callback=callback)
    results_plotter.plot_results([log_dir], time_steps,
                                 results_plotter.X_TIMESTEPS, "PPO Yaw")
    plt.show()

    # save the models
    modelRoll.save("Drone_Roll_PPO_001")
    modelPitch.save("Drone_Pitch_PPO_001")
    modelYaw.save("Drone_Yaw_PPO_001")

    # load a model
    #model = PPO2.load("Drone_Roll_PPO_001")

    # testing: generate the time response
    T = [0]
    # test loop
    t = 0
    #obs = env.reset()
    obsRoll = envRoll.reset()
    obsPitch = envPitch.reset()
    obsYaw = envYaw.reset()
    # initial angles, converted to degrees to match the samples appended below
    Roll = [(180 / np.pi) * envRoll.state[0]]
    Pitch = [(180 / np.pi) * envPitch.state[0]]
    Yaw = [(180 / np.pi) * envYaw.state[0]]

    # simulation loop
    while t < 10:  # up to 10 seconds

        actionRoll, _states = modelRoll.predict(obsRoll)
        # Retrieve new state, reward, and whether the state is terminal
        obsRoll, reward, done, info = envRoll.step(actionRoll)
        Roll.append((180 / np.pi) * envRoll.state[0])

        actionPitch, _states = modelPitch.predict(obsPitch)
        # Retrieve new state, reward, and whether the state is terminal
        obsPitch, reward, done, info = envPitch.step(actionPitch)
        Pitch.append((180 / np.pi) * envPitch.state[0])

        actionYaw, _states = modelYaw.predict(obsYaw)
        # Retrieve new state, reward, and whether the state is terminal
        obsYaw, reward, done, info = envYaw.step(actionYaw)
        Yaw.append((180 / np.pi) * envYaw.state[0])

        t += 0.01
        T.append(t)

    #Plots
    plt.figure(1)
    plt.plot(T, Roll)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Roll')
    plt.xlabel('Time (seconds)')
    plt.title('Roll Response')
    plt.grid()
    plt.show()

    plt.figure(2)
    plt.plot(T, Pitch)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Pitch')
    plt.xlabel('Time (seconds)')
    plt.title('Pitch Response')
    plt.grid()
    plt.show()

    plt.figure(3)
    plt.plot(T, Yaw)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Yaw')
    plt.xlabel('Time (seconds)')
    plt.title('Yaw Response')
    plt.grid()
    plt.show()