# Imports assumed by this snippet (stable-baselines 2.x API).
import os

import numpy as np

from stable_baselines import DDPG
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec


def test_ddpg_normalization():
    """
    Test that the observation and return normalization statistics are properly saved and loaded.
    """
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=0.05)
    model = DDPG('MlpPolicy',
                 'Pendulum-v0',
                 memory_limit=50000,
                 normalize_observations=True,
                 normalize_returns=True,
                 nb_rollout_steps=128,
                 nb_train_steps=1,
                 batch_size=64,
                 param_noise=param_noise)
    model.learn(1000)
    obs_rms_params = model.sess.run(model.obs_rms_params)
    ret_rms_params = model.sess.run(model.ret_rms_params)
    model.save('./test_ddpg')

    loaded_model = DDPG.load("test_ddpg")
    obs_rms_params_2 = loaded_model.sess.run(loaded_model.obs_rms_params)
    ret_rms_params_2 = loaded_model.sess.run(loaded_model.ret_rms_params)

    for param, param_loaded in zip(obs_rms_params + ret_rms_params,
                                   obs_rms_params_2 + ret_rms_params_2):
        assert np.allclose(param, param_loaded)

    del model, loaded_model

    # model.save may append an extension (.pkl or .zip, depending on the
    # stable-baselines version), so check the common variants when cleaning up.
    for path in ("./test_ddpg", "./test_ddpg.pkl", "./test_ddpg.zip"):
        if os.path.exists(path):
            os.remove(path)
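
To run this check directly rather than through pytest, a minimal entry point is enough (the __main__ guard below is an addition, not part of the original test module):

if __name__ == "__main__":
    # Executes the save/load consistency check once and cleans up after itself.
    test_ddpg_normalization()
    print("normalization save/load check passed")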
Example #2
import gym
from stable_baselines.ddpg import LnMlpPolicy
from stable_baselines.ddpg import DDPG
env = gym.make('HalfCheetah-v3')
model = DDPG(LnMlpPolicy, env,
             gamma=0.95,
             buffer_size=1000000,
             param_noise_adaption_interval=0.22,
             batch_size=256,
             normalize_observations=True,
             normalize_returns=False,
             policy_kwargs=dict(layers=[400, 300]),
             verbose=1)
model.learn(total_timesteps=1000000)
model.save('Cheetah_model_DDPG')
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()
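
The reload snippet below is an illustrative sketch, not part of the original example; it assumes the Cheetah_model_DDPG checkpoint written above and the same HalfCheetah-v3 environment:

import gym
from stable_baselines.ddpg import DDPG

env = gym.make('HalfCheetah-v3')
# DDPG.load restores the policy weights and normalization statistics saved above.
model = DDPG.load('Cheetah_model_DDPG', env=env)

obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()
env.close()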
Example #3
# Imports assumed by this snippet (stable-baselines 2.x API).
import os
import sys

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.bench import Monitor
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.ddpg.policies import MlpPolicy


def main(env,
         load,
         save_path,
         load_path=None,
         train_timesteps=1.25e6,
         eval_timesteps=5e3):

    # arguments
    print(
        "env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
        % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    # FlattenDictWrapper requires an older gym release that still ships this wrapper.
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = DDPG.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = DDPG(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    # (best_mean_reward and n_steps are placeholders for a progress-monitoring
    # callback; no callback is passed here, so they are unused during this run)
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
Example #4
# Imports assumed by this snippet (older stable-baselines 2.x DDPG API with MPI).
import os
import time

import gym
import numpy as np
import tensorflow as tf
from mpi4py import MPI

from stable_baselines import DDPG, bench, logger
from stable_baselines.common import set_global_seeds
from stable_baselines.ddpg.memory import Memory
from stable_baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                         OrnsteinUhlenbeckActionNoise)
from stable_baselines.ddpg.policies import LnMlpPolicy, MlpPolicy


def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise type(s) ('adaptive-param', 'normal' or 'ou'); multiple noise types can
        be combined by separating them with commas, each formatted as 'name_stddev' (e.g. 'ou_0.2')
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keyword arguments; must include num_timesteps (total training steps), the rest is
        passed on to the DDPG constructor
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 100000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Record the start time on the rank-0 worker only (logging for other ranks was disabled above).
    start_time = 0
    if rank == 0:
        start_time = time.time()

    if layer_norm:
        policy = LnMlpPolicy
    else:
        policy = MlpPolicy

    # total training steps come in through kwargs; everything else goes to DDPG
    num_timesteps = kwargs.pop('num_timesteps')

    model = DDPG(policy=policy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 verbose=1,
                 **kwargs)
    print("Learning started")  #

    model.learn(total_timesteps=num_timesteps)
    print("Saving model to LunarLanderCont_ddpg.pkl")  #
    model.save("LunarLanderCont_ddpg_2.pkl")  #
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
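
A hypothetical invocation of run (the environment id, seed, noise string, and timestep count below are illustrative assumptions; the noise string follows the 'name_stddev' format parsed above):

if __name__ == "__main__":
    # Adaptive parameter noise plus Ornstein-Uhlenbeck action noise, both with stddev 0.2.
    run(env_id='LunarLanderContinuous-v2',
        seed=0,
        noise_type='adaptive-param_0.2,ou_0.2',
        layer_norm=True,
        evaluation=False,
        num_timesteps=int(1e6))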
Example #5
# Imports assumed by this snippet (robosuite's Gym wrapper plus the older
# stable-baselines 2.x DDPG API).
import numpy as np
import robosuite as suite
from robosuite.wrappers import GymWrapper

from stable_baselines import DDPG, bench, logger
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import CnnPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.ddpg.memory import Memory
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise
from stable_baselines.ddpg.policies import MlpPolicy as ddpgMlpPolicy


def train(env_id, num_timesteps, seed, model_path=None, images=False):
    """
    Train PPO2 model for Mujoco environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    """
    def make_env():
        if images:
            # Pixel + depth observations from the offscreen renderer.
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=False,  # no low-dimensional object state
                    use_camera_obs=True,  # use pixel observations
                    has_offscreen_renderer=True,  # required for camera observations
                    has_renderer=False,  # no on-screen rendering during training
                    camera_depth=True,  # also return the depth channel
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control frequency (Hz), high enough for smooth simulation
                    render_visual_mesh=False,
                ),
                keys=["image", "depth"],
                images=True,
            )
        else:
            # Low-dimensional object-state observations only.
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_object_obs=True,  # use low-dimensional object state
                    use_camera_obs=False,  # no pixel observations
                    has_offscreen_renderer=False,  # not needed without camera observations
                    has_renderer=False,  # no on-screen rendering during training
                    camera_depth=False,
                    reward_shaping=True,  # use dense rewards
                    control_freq=10,  # control frequency (Hz), high enough for smooth simulation
                    render_visual_mesh=False,
                )  # , keys=["image", "depth"], images=True,
            )
        # Fill in gym attributes that the downstream wrappers expect but this GymWrapper may not set.
        env_out.reward_range = None
        env_out.metadata = None
        env_out.spec = None
        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out

    #env = make_env()

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = CnnPolicy if images else MlpPolicy
    tblog = "/cvgl2/u/surajn/workspace/tb_logs/sawyerlift_all/"
    nb_actions = env.action_space.shape[-1]
    #model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10,
    #             ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1, tensorboard_log=tblog)
    #model = TRPO(policy=policy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0,
    #                 gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3, tensorboard_log=tblog, verbose=1)
    model = DDPG(policy=ddpgMlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=None,
                 param_noise=AdaptiveParamNoiseSpec(initial_stddev=0.2,
                                                    desired_action_stddev=0.2),
                 action_noise=OrnsteinUhlenbeckActionNoise(
                     mean=np.zeros(nb_actions),
                     sigma=float(0.2) * np.ones(nb_actions)),
                 memory_limit=int(1e6),
                 verbose=2,
                 tensorboard_log=tblog)

    model.learn(total_timesteps=num_timesteps)
    env.close()

    if model_path:
        model.save(model_path)
        #tf_util.save_state(model_path)

    return model, env
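
A hypothetical driver for train (the save path and hyperparameter values are assumptions):

if __name__ == "__main__":
    trained_model, train_env = train(env_id="SawyerLift",
                                     num_timesteps=int(1e6),
                                     seed=0,
                                     model_path="sawyerlift_ddpg",
                                     images=False)
    # Note: the VecNormalize statistics live in train_env, not in the saved DDPG
    # checkpoint, so keep (or persist) that wrapper if the policy is evaluated
    # later with normalized observations.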