def __init__(self,
             survive_reward=2e-1,
             ctrl_cost_coeff=1e-3,
             contact_cost_coeff=1e-5,
             velocity_deviation_cost_coeff=1e-2,
             *args,
             **kwargs):
    MultiDirectionBaseEnv.__init__(
        self,
        survive_reward=survive_reward,
        ctrl_cost_coeff=ctrl_cost_coeff,
        contact_cost_coeff=contact_cost_coeff,
        velocity_deviation_cost_coeff=velocity_deviation_cost_coeff,
        *args, **kwargs)
    # HumanoidEnv names the same coefficients differently:
    # survive_reward -> alive_bonus, contact_cost_coeff -> impact_cost_coeff,
    # velocity_deviation_cost_coeff -> vel_deviation_cost_coeff.
    # TODO: unify these argument names.
    HumanoidEnv.__init__(
        self,
        alive_bonus=survive_reward,
        ctrl_cost_coeff=ctrl_cost_coeff,
        impact_cost_coeff=contact_cost_coeff,
        vel_deviation_cost_coeff=velocity_deviation_cost_coeff,
        *args, **kwargs)
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=(M, M),
    )

    df = DFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M])  # discriminator; its input is the actions

    vf = VFunction(env_spec=env.spec, hidden_layer_sizes=[M, M])

    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        value_n_particles=16,
        td_target_update_interval=1000,
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False,
        df=df,
        vf=vf,
        df_lr=1e-3,
        dist=variant['dist'],
    )

    algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'BlocksSimpleXYQ-v0':
        target = [-1.0, 0.0]
        env = bsmp.BlocksSimpleXYQ(
            multi_goal=variant['blocks_multigoal'],
            time_limit=variant['max_path_length'],
            env_config=variant['blocks_simple_xml'],
            goal=target)
        env = env_wrap.obsTupleWrap(env, add_action_to_obs=False)
        env = gym_env.GymEnv(
            env,
            video_schedule=glob.video_scheduler.video_schedule,
            log_dir=".")
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env=env,
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env=env, hidden_layer_sizes=(M, M))
    policy = StochasticNNPolicy(env=env, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
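For reference, a minimal sketch of a `variant` dict that would satisfy the keys this `run_experiment` reads. Every key below appears in the code above; every value is a hypothetical placeholder, not a setting from the source. The `blocks_*` keys are only read on the `BlocksSimpleXYQ-v0` branch and are omitted here.

# Hypothetical variant; keys match what run_experiment reads above,
# values are illustrative placeholders only.
variant = dict(
    env_name='swimmer-rllab',
    max_pool_size=int(1e6),
    max_path_length=1000,
    epoch_length=1000,
    n_epochs=500,
    n_train_repeat=1,
    batch_size=128,
    layer_size=128,
    kernel_particles=16,
    kernel_update_ratio=0.5,
    value_n_particles=16,
    td_target_update_interval=1000,
    qf_lr=3e-4,
    policy_lr=3e-4,
    discount=0.99,
    reward_scale=1.0,
)
run_experiment(variant)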
def run_task(*_):
    env = normalize(HumanoidEnv())

    policy = DeterministicMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))

    es = OUStrategy(env_spec=env.spec)

    qf = ContinuousMLPQFunction(env_spec=env.spec, hidden_sizes=(32, 32))

    """
    Persistence Length Exploration
    """
    lp = Persistence_Length_Exploration(
        env=env,
        qf=qf,
        policy=policy,
        L_p=L_p_param[l_p_ind],
        b_step_size=b_step_size[b_ind],
        sigma=sigma_param[s_ind],
        max_exploratory_steps=max_exploratory_steps_iters,
        batch_size=batch_size_value,
        n_epochs=num_episodes,
        scale_reward=0.01,
        epoch_length=steps_per_episode,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
    )

    """
    DDPG
    """
    algo = DDPG(
        env=env,
        policy=policy,
        es=es,
        qf=qf,
        lp=lp,
        batch_size=batch_size_value,
        max_path_length=100,
        epoch_length=steps_per_episode,
        min_pool_size=10000,
        n_epochs=num_episodes,
        discount=0.99,
        scale_reward=0.01,
        qf_learning_rate=0.001,
        policy_learning_rate=0.0001,
        # Uncomment the line below to enable plotting.
        # plot=True,
    )
    algo.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    elif variant['env_name'] == 'sawyer-rllab':
        env = normalize(SawyerTestEnv())
    elif variant['env_name'] == 'arm3Ddisc-rllab':
        env = normalize(Arm3dDiscEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(M, M))
    policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
def get(perm): name = perm["problem"] if name.lower() == "cartpole": from rllab.envs.box2d.cartpole_env import CartpoleEnv return normalize(CartpoleEnv()) elif name.lower() == "mountain car height bonus": from rllab.envs.box2d.mountain_car_env import MountainCarEnv return normalize(MountainCarEnv()) elif name.lower() == "mountain car": from rllab.envs.box2d.mountain_car_env import MountainCarEnv return normalize(MountainCarEnv(height_bonus=0)) elif name.lower() == "gym mountain car": from rllab.envs.gym_env import GymEnv return normalize(GymEnv("MountainCarContinuous-v0", record_video=False)) elif name.lower() == "pendulum": from rllab.envs.gym_env import GymEnv return normalize(GymEnv("Pendulum-v0", record_video=False)) elif name.lower() == "mujoco double pendulum": from rllab.envs.mujoco.inverted_double_pendulum_env import InvertedDoublePendulumEnv return normalize(InvertedDoublePendulumEnv()) elif name.lower() == "double pendulum": from rllab.envs.box2d.double_pendulum_env import DoublePendulumEnv return normalize(DoublePendulumEnv()) elif name.lower() == "hopper": from rllab.envs.mujoco.hopper_env import HopperEnv return normalize(HopperEnv()) elif name.lower() == "swimmer": from rllab.envs.mujoco.swimmer_env import SwimmerEnv return normalize(SwimmerEnv()) elif name.lower() == "2d walker": from rllab.envs.mujoco.walker2d_env import Walker2DEnv return normalize(Walker2DEnv()) elif name.lower() == "half cheetah": from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv return normalize(HalfCheetahEnv()) elif name.lower() == "ant": from rllab.envs.mujoco.ant_env import AntEnv return normalize(AntEnv()) elif name.lower() == "simple humanoid": from rllab.envs.mujoco.simple_humanoid_env import SimpleHumanoidEnv return normalize(SimpleHumanoidEnv()) elif name.lower() == "full humanoid": from rllab.envs.mujoco.humanoid_env import HumanoidEnv return normalize(HumanoidEnv()) else: raise NotImplementedError(f"Environment {name} unknown")
                    help='eval interval (step)')
parser.add_argument('--ckpt-interval', type=int, default=499999,
                    help='checkpoint interval (step)')
parser.add_argument('--gpu', type=int, default=4,
                    help='GPU id to run on (negative values run on CPU)')

args = parser.parse_args()
args.hadamard = bool(args.hadamard)

if args.gpu >= 0:
    print("gpu ok")
    ptu.set_gpu_mode(True, args.gpu)

# Set up the environment.
if args.env_name == 'Humanoidrllab':
    from rllab.envs.mujoco.humanoid_env import HumanoidEnv
    from rllab.envs.normalized_env import normalize
    env = normalize(HumanoidEnv())
    max_episode_steps = float('inf')
    if args.seed >= 0:
        global seed_
        seed_ = args.seed
else:
    env = gym.make(args.env_name)
    max_episode_steps = env._max_episode_steps
    env = NormalizedActions(env)
    if args.seed >= 0:
        env.seed(args.seed)

# Seed all RNGs for reproducibility.
if args.seed >= 0:
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
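A hypothetical invocation of this script, assuming it is saved as train.py and that the (truncated) parser setup declares --env-name and --seed in the same dashed style as the flags visible above:

python train.py --env-name Humanoidrllab --gpu 0 --seed 1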
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    env = DelayedEnv(env, delay=0.01)

    pool = SimpleReplayBuffer(
        env_spec=env.spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    sampler = RemoteSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'],
    )

    base_kwargs = dict(
        sampler=sampler,
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=env.spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reparameterize=variant['reparameterize'],
        reg=0.001,
    )

    algorithm = SAC(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_reward=variant['scale_reward'],
        discount=variant['discount'],
        tau=variant['tau'],
        reparameterize=variant['reparameterize'],
        save_full_state=False,
    )

    algorithm.train()
def run_experiment(variant): if variant['env_name'] == 'humanoid-rllab': from rllab.envs.mujoco.humanoid_env import HumanoidEnv env = normalize(HumanoidEnv()) elif variant['env_name'] == 'swimmer-rllab': from rllab.envs.mujoco.swimmer_env import SwimmerEnv env = normalize(SwimmerEnv()) elif variant["env_name"] == "Point2D-v0": import sac.envs.point2d_env env = GymEnv(variant["env_name"]) else: env = normalize(GymEnv(variant['env_name'])) obs_space = env.spec.observation_space assert isinstance(obs_space, spaces.Box) low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)]) high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)]) aug_obs_space = spaces.Box(low=low, high=high) aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space) pool = SimpleReplayBuffer( env_spec=aug_env_spec, max_replay_buffer_size=variant['max_pool_size'], ) base_kwargs = dict(min_pool_size=variant['max_path_length'], epoch_length=variant['epoch_length'], n_epochs=variant['n_epochs'], max_path_length=variant['max_path_length'], batch_size=variant['batch_size'], n_train_repeat=variant['n_train_repeat'], eval_render=False, eval_n_episodes=1, eval_deterministic=True, sampler=SimpleSampler( max_path_length=variant["max_path_length"], min_pool_size=variant["max_path_length"], batch_size=variant["batch_size"])) M = variant['layer_size'] qf = NNQFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) vf = NNVFunction( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], ) policy = GaussianPolicy( env_spec=aug_env_spec, hidden_layer_sizes=[M, M], reg=0.001, ) # policy = GMMPolicy( # env_spec=aug_env_spec, # K=variant['K'], # hidden_layer_sizes=[M, M], # qf=qf, # reg=0.001, # ) discriminator = NNDiscriminatorFunction( env_spec=env.spec, hidden_layer_sizes=[M, M], num_skills=variant['num_skills'], ) algorithm = DIAYN(base_kwargs=base_kwargs, env=env, policy=policy, discriminator=discriminator, pool=pool, qf=qf, vf=vf, lr=variant['lr'], scale_entropy=variant['scale_entropy'], discount=variant['discount'], tau=variant['tau'], num_skills=variant['num_skills'], save_full_state=False, include_actions=variant['include_actions'], learn_p_z=variant['learn_p_z'], add_p_z=variant['add_p_z'], reparametrize=variant["reparametrize"]) algorithm.train()
def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    # Widen the observation space by num_skills entries for the one-hot skill indicator.
    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
    )

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )

    policy = GMMPolicy(
        env_spec=aug_env_spec,
        K=variant['K'],
        hidden_layer_sizes=[M, M],
        qf=qf,
        reg=0.001,
    )

    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN_BD(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        # Additional params for behaviour tracking.
        metric=variant['metric'],
        env_id=variant['prefix'],
        eval_freq=variant['eval_freq'],
        log_dir=get_logdir(args, variant),
    )

    algorithm.train()
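Both DIAYN-style snippets above widen the observation space by num_skills entries so that a one-hot skill indicator can be appended to each observation before it enters the networks. A minimal sketch of that augmentation follows; the helper name concat_obs_z is assumed for illustration, not taken from the source.

import numpy as np

def concat_obs_z(obs, z, num_skills):
    """Append a one-hot encoding of skill index z to an observation."""
    one_hot = np.zeros(num_skills)
    one_hot[z] = 1.0
    return np.hstack([obs, one_hot])

# Example: a 3-dim observation combined with skill 2 out of 4.
aug_obs = concat_obs_z(np.array([0.1, -0.3, 0.7]), z=2, num_skills=4)
# -> array([ 0.1, -0.3,  0.7,  0. ,  0. ,  1. ,  0. ])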